/*-
 * Copyright (c) 1996, by Steve Passe
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: releng/8.2/sys/i386/i386/mp_machdep.c 215141 2010-11-11 19:39:38Z jhb $");

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_mp_watchdog.h"
#include "opt_pmap.h"
#include "opt_sched.h"
#include "opt_smp.h"

#if !defined(lint)
#if !defined(SMP)
#error How did you get here?
#endif

#ifndef DEV_APIC
#error The apic device is required for SMP, add "device apic" to your config file.
#endif
#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
#error SMP not supported with CPU_DISABLE_CMPXCHG
#endif
#endif /* not lint */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include <machine/apicreg.h>
#include <machine/clock.h>
#include <machine/cputypes.h>
#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>

#define WARMBOOT_TARGET		0
#define WARMBOOT_OFF		(KERNBASE + 0x0467)
#define WARMBOOT_SEG		(KERNBASE + 0x0469)

#define CMOS_REG		(0x70)
#define CMOS_DATA		(0x71)
#define BIOS_RESET		(0x0f)
#define BIOS_WARM		(0x0a)
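
/*
 * The CMOS shutdown status byte is reached through an index/data register
 * pair: write the byte's offset (BIOS_RESET, 0x0f) to CMOS_REG, then read
 * or write the value through CMOS_DATA.  A sketch of the warm-boot
 * handshake, mirroring what start_all_aps() below actually does:
 *
 *	outb(CMOS_REG, BIOS_RESET);	select the shutdown status byte
 *	outb(CMOS_DATA, BIOS_WARM);	0x0a: warm start, JMP via 40:67
 *
 * With 0x0a latched, a CPU coming out of INIT jumps through the warm-boot
 * vector at 0x467 (WARMBOOT_OFF/WARMBOOT_SEG) instead of running the full
 * BIOS POST.
 */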

/*
 * This code MUST be enabled both here and in mpboot.s.
 * It traces the very early stages of AP boot by placing values in CMOS RAM.
 * It is NORMALLY never needed, hence the primitive method for enabling it.
 *
#define CHECK_POINTS
 */

#if defined(CHECK_POINTS) && !defined(PC98)
#define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))

#define CHECK_INIT(D)				\
do {						\
	CHECK_WRITE(0x34, (D));			\
	CHECK_WRITE(0x35, (D));			\
	CHECK_WRITE(0x36, (D));			\
	CHECK_WRITE(0x37, (D));			\
	CHECK_WRITE(0x38, (D));			\
	CHECK_WRITE(0x39, (D));			\
} while (0)

#define CHECK_PRINT(S)				\
	printf("%s: %d, %d, %d, %d, %d, %d\n",	\
	    (S),				\
	    CHECK_READ(0x34),			\
	    CHECK_READ(0x35),			\
	    CHECK_READ(0x36),			\
	    CHECK_READ(0x37),			\
	    CHECK_READ(0x38),			\
	    CHECK_READ(0x39))

#else /* CHECK_POINTS */

#define CHECK_INIT(D)
#define CHECK_PRINT(S)
#define CHECK_WRITE(A, D)

#endif /* CHECK_POINTS */
132
133 /* lock region used by kernel profiling */
134 int mcount_lock;
135
136 int mp_naps; /* # of Applications processors */
137 int boot_cpu_id = -1; /* designated BSP */
138
139 extern struct pcpu __pcpu[];
140
141 /* AP uses this during bootstrap. Do not staticize. */
142 char *bootSTK;
143 static int bootAP;
144
145 /* Free these after use */
146 void *bootstacks[MAXCPU];
147 static void *dpcpu;
148
149 /* Hotwire a 0->4MB V==P mapping */
150 extern pt_entry_t *KPTphys;
151
152 struct pcb stoppcbs[MAXCPU];
153
154 /* Variables needed for SMP tlb shootdown. */
155 vm_offset_t smp_tlb_addr1;
156 vm_offset_t smp_tlb_addr2;
157 volatile int smp_tlb_wait;
158
159 #ifdef COUNT_IPIS
160 /* Interrupt counts. */
161 static u_long *ipi_preempt_counts[MAXCPU];
162 static u_long *ipi_ast_counts[MAXCPU];
163 u_long *ipi_invltlb_counts[MAXCPU];
164 u_long *ipi_invlrng_counts[MAXCPU];
165 u_long *ipi_invlpg_counts[MAXCPU];
166 u_long *ipi_invlcache_counts[MAXCPU];
167 u_long *ipi_rendezvous_counts[MAXCPU];
168 u_long *ipi_lazypmap_counts[MAXCPU];
169 #endif
170
171 /*
172 * Local data and functions.
173 */
174
175 static volatile cpumask_t ipi_nmi_pending;
176
177 /* used to hold the AP's until we are ready to release them */
178 static struct mtx ap_boot_mtx;
179
180 /* Set to 1 once we're ready to let the APs out of the pen. */
181 static volatile int aps_ready = 0;
182
183 /*
184 * Store data from cpu_add() until later in the boot when we actually setup
185 * the APs.
186 */
187 struct cpu_info {
188 int cpu_present:1;
189 int cpu_bsp:1;
190 int cpu_disabled:1;
191 int cpu_hyperthread:1;
192 } static cpu_info[MAX_APIC_ID + 1];
193 int cpu_apic_ids[MAXCPU];
194 int apic_cpuids[MAX_APIC_ID + 1];
195
196 /* Holds pending bitmap based IPIs per CPU */
197 static volatile u_int cpu_ipi_pending[MAXCPU];
198
199 static u_int boot_address;
200 static int cpu_logical; /* logical cpus per core */
201 static int cpu_cores; /* cores per package */
202
203 static void assign_cpu_ids(void);
204 static void install_ap_tramp(void);
205 static void set_interrupt_apic_ids(void);
206 static int start_all_aps(void);
207 static int start_ap(int apic_id);
208 static void release_aps(void *dummy);
209
210 static int hlt_logical_cpus;
211 static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */
212 static cpumask_t hyperthreading_cpus_mask;
213 static int hyperthreading_allowed = 1;
214 static struct sysctl_ctx_list logical_cpu_clist;
215
216 static void
217 mem_range_AP_init(void)
218 {
219 if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
220 mem_range_softc.mr_op->initAP(&mem_range_softc);
221 }
222
223 static void
224 topo_probe_amd(void)
225 {
226
227 /* AMD processors do not support HTT. */
228 cpu_cores = (amd_feature2 & AMDID2_CMP) != 0 ?
229 (cpu_procinfo2 & AMDID_CMP_CORES) + 1 : 1;
230 cpu_logical = 1;
231 }
232
233 /*
234 * Round up to the next power of two, if necessary, and then
235 * take log2.
236 * Returns -1 if argument is zero.
237 */
238 static __inline int
239 mask_width(u_int x)
240 {
241
242 return (fls(x << (1 - powerof2(x))) - 1);
243 }
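
/*
 * Worked examples of the computation above:
 *	mask_width(1) == 0	(1 is 2^0)
 *	mask_width(4) == 2	(4 is 2^2)
 *	mask_width(6) == 3	(6 rounds up to 8 == 2^3)
 *	mask_width(0) == -1	(fls(0) == 0)
 */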

static void
topo_probe_0x4(void)
{
	u_int p[4];
	int pkg_id_bits;
	int core_id_bits;
	int max_cores;
	int max_logical;
	int id;

	/* Both zero and one here mean one logical processor per package. */
	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
	if (max_logical <= 1)
		return;

	/*
	 * Because of the uniformity assumption we examine only
	 * those logical processors that belong to the same
	 * package as the BSP.  Further, we count the number of
	 * logical processors that belong to the same core as the
	 * BSP, thus deducing the number of threads per core.
	 */
	cpuid_count(0x04, 0, p);
	max_cores = ((p[0] >> 26) & 0x3f) + 1;
	core_id_bits = mask_width(max_logical/max_cores);
	if (core_id_bits < 0)
		return;
	pkg_id_bits = core_id_bits + mask_width(max_cores);

	for (id = 0; id <= MAX_APIC_ID; id++) {
		/* Check logical CPU availability. */
		if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
			continue;
		/* Check if logical CPU has the same package ID. */
		if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits))
			continue;
		cpu_cores++;
		/* Check if logical CPU has the same package and core IDs. */
		if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits))
			cpu_logical++;
	}

	KASSERT(cpu_cores >= 1 && cpu_logical >= 1,
	    ("topo_probe_0x4 couldn't find BSP"));

	cpu_cores /= cpu_logical;
	hyperthreading_cpus = cpu_logical;
}
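
/*
 * A worked example with hypothetical values: on a package reporting
 * max_logical = 8 (CPUID.1:EBX[23:16]) and max_cores = 4 (leaf 4,
 * EAX[31:26] + 1), core_id_bits = mask_width(8 / 4) = 1 and
 * pkg_id_bits = 1 + mask_width(4) = 3.  With all eight APIC IDs 0..7
 * present and boot_cpu_id = 0, the loop counts cpu_cores = 8 (every
 * logical CPU shares the BSP's package bits, id >> 3) and
 * cpu_logical = 2 (IDs 0 and 1 share the BSP's core bits, id >> 1),
 * so the final division yields cpu_cores = 4.
 */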

static void
topo_probe_0xb(void)
{
	u_int p[4];
	int bits;
	int cnt;
	int i;
	int logical;
	int type;
	int x;

	/* We only support three levels for now. */
	for (i = 0; i < 3; i++) {
		cpuid_count(0x0b, i, p);

		/* Fall back if CPU leaf 11 doesn't really exist. */
		if (i == 0 && p[1] == 0) {
			topo_probe_0x4();
			return;
		}

		bits = p[0] & 0x1f;
		logical = p[1] &= 0xffff;
		type = (p[2] >> 8) & 0xff;
		if (type == 0 || logical == 0)
			break;
		/*
		 * Because of the uniformity assumption we examine only
		 * those logical processors that belong to the same
		 * package as the BSP.
		 */
		for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
			if (!cpu_info[x].cpu_present ||
			    cpu_info[x].cpu_disabled)
				continue;
			if (x >> bits == boot_cpu_id >> bits)
				cnt++;
		}
		if (type == CPUID_TYPE_SMT)
			cpu_logical = cnt;
		else if (type == CPUID_TYPE_CORE)
			cpu_cores = cnt;
	}
	if (cpu_logical == 0)
		cpu_logical = 1;
	cpu_cores /= cpu_logical;
}
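
/*
 * A worked example with hypothetical values: a leaf 0xB enumeration of
 * level 0 = { bits 1, type SMT } and level 1 = { bits 3, type CORE } on
 * the 2-thread, 4-core package above gives cpu_logical = 2 (APIC IDs
 * agreeing with the BSP in bits [31:1], i.e. the same core) and
 * cpu_cores = 8 (IDs agreeing in bits [31:3], i.e. the same package);
 * the final division then leaves cpu_cores = 4 as expected.
 */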

/*
 * Both the topology discovery code and the code that consumes topology
 * information assume top-down uniformity of the topology.
 * That is, all physical packages must be identical and each
 * core in a package must have the same number of threads.
 * Topology information is queried only on the BSP, on which this
 * code runs and for which it can query CPUID information.
 * The topology is then extrapolated to all packages using the
 * uniformity assumption.
 */
static void
topo_probe(void)
{
	static int cpu_topo_probed = 0;

	if (cpu_topo_probed)
		return;

	logical_cpus_mask = 0;
	if (mp_ncpus <= 1)
		cpu_cores = cpu_logical = 1;
	else if (cpu_vendor_id == CPU_VENDOR_AMD)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL) {
		/*
		 * See the Intel(R) 64 Architecture Processor
		 * Topology Enumeration article for details.
		 *
		 * Note that the 0x1 <= cpu_high < 4 case should be
		 * compatible with topo_probe_0x4() logic when
		 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
		 * or it should trigger the fallback otherwise.
		 */
		if (cpu_high >= 0xb)
			topo_probe_0xb();
		else if (cpu_high >= 0x1)
			topo_probe_0x4();
	}

	/*
	 * Fallback: assume each logical CPU is in a separate
	 * physical package.  That is, no multi-core, no SMT.
	 */
	if (cpu_cores == 0 || cpu_logical == 0)
		cpu_cores = cpu_logical = 1;
	cpu_topo_probed = 1;
}

struct cpu_group *
cpu_topo(void)
{
	int cg_flags;

	/*
	 * Determine whether any threading flags are
	 * necessary.
	 */
	topo_probe();
	if (cpu_logical > 1 && hyperthreading_cpus)
		cg_flags = CG_FLAG_HTT;
	else if (cpu_logical > 1)
		cg_flags = CG_FLAG_SMT;
	else
		cg_flags = 0;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * No multi-core or hyper-threading.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT, no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags));
	/*
	 * Only multi-core, no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_L2, cpu_cores,
	    CG_SHARE_L1, cpu_logical, cg_flags));
}

/*
 * Calculate a usable address in base memory for the AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

	boot_address = trunc_page(basemem);	/* round down to 4k boundary */
	if ((basemem - boot_address) < bootMP_size)
		boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */

	return (boot_address);
}
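
/*
 * For example (values hypothetical), with basemem = 0x9fc00 (639 KB):
 * trunc_page() gives boot_address = 0x9f000, leaving 0xc00 bytes of base
 * memory above it.  If bootMP_size were larger than 0xc00, the address
 * would be lowered one page to 0x9e000.
 */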

void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU)
		mp_ncpus++;
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	mp_maxid = MAXCPU - 1;
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record the BSP in the CPU map so that the mbuf init code
	 * works correctly.
	 */
	all_cpus = 1;
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Setup
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}

/*
 * Initialize the IPI handlers and start up the APs.
 */
void
cpu_mp_start(void)
{
	int i;

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;
		cpu_ipi_pending[i] = 0;
	}

	/* Install an inter-CPU IPI for TLB invalidation */
	setidt(IPI_INVLTLB, IDTVEC(invltlb),
	    SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(IPI_INVLPG, IDTVEC(invlpg),
	    SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(IPI_INVLRNG, IDTVEC(invlrng),
	    SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* Install an inter-CPU IPI for cache invalidation. */
	setidt(IPI_INVLCACHE, IDTVEC(invlcache),
	    SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* Install an inter-CPU IPI for lazy pmap release */
	setidt(IPI_LAZYPMAP, IDTVEC(lazypmap),
	    SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* Install an inter-CPU IPI for all-CPU rendezvous */
	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous),
	    SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* Install generic inter-CPU IPI handler */
	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
	    SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* Install an inter-CPU IPI for CPU stop/restart */
	setidt(IPI_STOP, IDTVEC(cpustop),
	    SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));

	/* Probe logical/physical core configuration. */
	topo_probe();

	assign_cpu_ids();

	/* Start each Application Processor */
	start_all_aps();

	set_interrupt_apic_ids();
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	const char *hyperthread;
	int i;

	printf("FreeBSD/SMP: %d package(s) x %d core(s)",
	    mp_ncpus / (cpu_cores * cpu_logical), cpu_cores);
	if (hyperthreading_cpus > 1)
		printf(" x %d HTT threads", cpu_logical);
	else if (cpu_logical > 1)
		printf(" x %d SMT threads", cpu_logical);
	printf("\n");

	/* List active CPUs first. */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1; i < mp_ncpus; i++) {
		if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread)
			hyperthread = "/HT";
		else
			hyperthread = "";
		printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread,
		    cpu_apic_ids[i]);
	}

	/* List disabled CPUs last. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled)
			continue;
		if (cpu_info[i].cpu_hyperthread)
			hyperthread = "/HT";
		else
			hyperthread = "";
		printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread,
		    i);
	}
}

/*
 * AP CPUs call this to initialize themselves.
 */
void
init_secondary(void)
{
	struct pcpu *pc;
	vm_offset_t addr;
	int	gsel_tss;
	int	x, myid;
	u_int	cr0;

	/* bootAP is set in start_ap() to our ID. */
	myid = bootAP;

	/* Get per-cpu data */
	pc = &__pcpu[myid];

	/* prime data page for it to use */
	pcpu_init(pc, myid, sizeof(struct pcpu));
	dpcpu_init(dpcpu, myid);
	pc->pc_apic_id = cpu_apic_ids[myid];
	pc->pc_prvspace = pc;
	pc->pc_curthread = 0;

	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

	for (x = 0; x < NGDT; x++) {
		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
	}

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (int) &gdt[myid * NGDT];
	lgdt(&r_gdt);			/* does magic intra-segment return */

	lidt(&r_idt);

	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
	PCPU_SET(common_tss.tss_esp0, 0);	/* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	ltr(gsel_tss);

	PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	cr0 = rcr0();
	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
	load_cr0(cr0);
	CHECK_WRITE(0x38, 5);

	/* Disable local APIC just to be sure. */
	lapic_disable();

	/* signal our startup to the BSP. */
	mp_naps++;
	CHECK_WRITE(0x39, 6);

	/* Spin until the BSP releases the APs. */
	while (!aps_ready)
		ia32_pause();

	/* BSP may have changed PTD while we were waiting */
	invltlb();
	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
		invlpg(addr);

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
	lidt(&r_idt);
#endif

	/* Initialize the PAT MSR if present. */
	pmap_init_pat();

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up FPU state on the AP */
	npxinit();

	/* set up SSE registers */
	enable_sse();

#ifdef PAE
	/* Enable the PTE no-execute bit. */
	if ((amd_feature & AMDID_NX) != 0) {
		uint64_t msr;

		msr = rdmsr(MSR_EFER) | EFER_NXE;
		wrmsr(MSR_EFER, msr);
	}
#endif

	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mca_init();

	mtx_lock_spin(&ap_boot_mtx);

	/* Init local apic for irq's */
	lapic_setup(1);

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* Determine if we are a logical CPU. */
	/* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */
	if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0)
		logical_cpus_mask |= PCPU_GET(cpumask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		hyperthreading_cpus_mask |= PCPU_GET(cpumask);

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	if (bootverbose)
		lapic_dump("AP");

	if (smp_cpus == mp_ncpus) {
		/* enable IPIs, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	 /* historic */
	}

	mtx_unlock_spin(&ap_boot_mtx);

	/* wait until all the APs are up */
	while (smp_started == 0)
		ia32_pause();

	/* enter the scheduler */
	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs that we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we can
 * simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (hyperthreading_cpus > 1 &&
		    apic_id % hyperthreading_cpus != 0)
			continue;

		intr_add_cpu(i);
	}
}

/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
	    &hyperthreading_allowed);

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) {
			cpu_info[i].cpu_hyperthread = 1;
#if defined(SCHED_ULE)
			/*
			 * Don't use HT CPU if it has been disabled by a
			 * tunable.
			 */
			if (hyperthreading_allowed == 0) {
				cpu_info[i].cpu_disabled = 1;
				continue;
			}
#endif
		}

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
	 *
	 * To minimize confusion for userland, we attempt to number
	 * CPUs such that all threads and cores in a package are
	 * grouped together.  For now we assume that the BSP is always
	 * the first thread in a package and just start adding APs
	 * starting with the BSP's APIC ID.
	 */
	mp_ncpus = 1;
	cpu_apic_ids[0] = boot_cpu_id;
	apic_cpuids[boot_cpu_id] = 0;
	for (i = boot_cpu_id + 1; i != boot_cpu_id;
	     i == MAX_APIC_ID ? i = 0 : i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			apic_cpuids[i] = mp_ncpus;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}
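
/*
 * For example (hypothetical layout): with boot_cpu_id = 4 and present
 * APIC IDs 0..7, the wrap-around loop above assigns
 *	cpu_apic_ids[] = { 4, 5, 6, 7, 0, 1, 2, 3 },
 * so the CPUs sharing the BSP's package (assuming IDs 4..7 are its
 * package-mates) get the low logical CPU numbers.
 */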

/*
 * start each AP in our list
 */
/* Lowest 1MB is already mapped: don't touch */
#define TMPMAP_START 1
static int
start_all_aps(void)
{
#ifndef PC98
	u_char mpbiosreason;
#endif
	uintptr_t kptbase;
	u_int32_t mpbioswarmvec;
	int apic_id, cpu, i;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* install the AP 1st level boot code */
	install_ap_tramp();

	/* save the current value of the warm-start vector */
	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
#ifndef PC98
	outb(CMOS_REG, BIOS_RESET);
	mpbiosreason = inb(CMOS_DATA);
#endif

	/* set up temporary P==V mapping for AP boot */
	/* XXX this is a hack, we should boot the AP on its own stack/PTD */
	kptbase = (uintptr_t)(void *)KPTphys;
	for (i = TMPMAP_START; i < NKPT; i++)
		PTD[i] = (pd_entry_t)(PG_V | PG_RW |
		    ((kptbase + i * PAGE_SIZE) & PG_FRAME));
	invltlb();

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];

		/* allocate and set up a boot stack data page */
		bootstacks[cpu] =
		    (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
		dpcpu = (void *)kmem_alloc(kernel_map, DPCPU_SIZE);
		/* setup a vector to our boot code */
		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
		*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
#ifndef PC98
		outb(CMOS_REG, BIOS_RESET);
		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
#endif

		bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 4;
		bootAP = cpu;

		/* attempt to start the Application Processor */
		CHECK_INIT(99);		/* setup checkpoints */
		if (!start_ap(apic_id)) {
			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
			CHECK_PRINT("trace");	/* show checkpoints */
			/* better panic as the AP may be running loose */
			printf("panic y/n? [y] ");
			if (cngetc() != 'n')
				panic("bye-bye");
		}
		CHECK_PRINT("trace");	/* show checkpoints */

		all_cpus |= (1 << cpu);		/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	/* restore the warmstart vector */
	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;

#ifndef PC98
	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, mpbiosreason);
#endif

	/* Undo V==P hack from above */
	for (i = TMPMAP_START; i < NKPT; i++)
		PTD[i] = 0;
	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);

	/* number of APs actually started */
	return (mp_naps);
}

/*
 * load the 1st level AP boot code into base memory.
 */

/* targets for relocation */
extern void bigJump(void);
extern void bootCodeSeg(void);
extern void bootDataSeg(void);
extern void MPentry(void);
extern u_int MP_GDT;
extern u_int mp_gdtbase;

static void
install_ap_tramp(void)
{
	int	x;
	int	size = *(int *) ((u_long) & bootMP_size);
	vm_offset_t va = boot_address + KERNBASE;
	u_char *src = (u_char *) ((u_long) bootMP);
	u_char *dst = (u_char *) va;
	u_int	boot_base = (u_int) bootMP;
	u_int8_t *dst8;
	u_int16_t *dst16;
	u_int32_t *dst32;

	KASSERT(size <= PAGE_SIZE,
	    ("'size' does not fit into PAGE_SIZE, as expected."));
	pmap_kenter(va, boot_address);
	pmap_invalidate_page(kernel_pmap, va);
	for (x = 0; x < size; ++x)
		*dst++ = *src++;

	/*
	 * Modify addresses in the code we just moved to basemem.
	 * Unfortunately we need fairly detailed info about mpboot.s for
	 * this to work.  Changes to mpboot.s might require changes here.
	 */

	/* boot code is located in KERNEL space */
	dst = (u_char *) va;

	/* modify the lgdt arg */
	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
	*dst32 = boot_address + ((u_int) & MP_GDT - boot_base);

	/* modify the ljmp target for MPentry() */
	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
	*dst32 = ((u_int) MPentry - KERNBASE);

	/* modify the target for boot code segment */
	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
	dst8 = (u_int8_t *) (dst16 + 1);
	*dst16 = (u_int) boot_address & 0xffff;
	*dst8 = ((u_int) boot_address >> 16) & 0xff;

	/* modify the target for boot data segment */
	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
	dst8 = (u_int8_t *) (dst16 + 1);
	*dst16 = (u_int) boot_address & 0xffff;
	*dst8 = ((u_int) boot_address >> 16) & 0xff;
}

/*
 * This function starts the AP (application processor) identified by the
 * APIC ID 'apic_id'.  It does quite a "song and dance" to accomplish this.
 * This is necessary because of the nuances of the different hardware we
 * might encounter.  It isn't pretty, but it seems to work.
 */
static int
start_ap(int apic_id)
{
	int vector, ms;
	int cpus;

	/* calculate the vector */
	vector = (boot_address >> 12) & 0xff;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	/*
	 * First we do an INIT/RESET IPI.  This INIT IPI might be run,
	 * resetting and running the target CPU.  OR this INIT IPI might be
	 * latched (P5 bug), with the CPU waiting for a STARTUP IPI.  OR
	 * this INIT IPI might be ignored.
	 */

	/* do an INIT IPI: assert RESET */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);

	/* wait for pending status end */
	lapic_ipi_wait(-1);

	/* do an INIT IPI: deassert RESET */
	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);

	/* wait for pending status end */
	DELAY(10000);		/* wait ~10 ms */
	lapic_ipi_wait(-1);

	/*
	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched (P5 bug); this first STARTUP would then terminate
	 * immediately, and the previously started INIT IPI would continue.
	 * OR the previous INIT IPI has already run, and this STARTUP IPI
	 * will run.  OR the previous INIT IPI was ignored, and this STARTUP
	 * IPI will run.
	 */

	/* do a STARTUP IPI */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200 us */

	/*
	 * Finally we do a second STARTUP IPI: this second STARTUP IPI
	 * should run IF the previous STARTUP IPI was cancelled by a latched
	 * INIT IPI.  OR this STARTUP IPI will be ignored, as only ONE
	 * STARTUP IPI is recognized after hardware RESET or INIT IPI.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200 us */

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		if (mp_naps > cpus)
			return (1);	/* return SUCCESS */
		DELAY(1000);
	}
	return (0);		/* return FAILURE */
}
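
/*
 * The STARTUP IPI vector encodes the real-mode entry point: the AP begins
 * executing at physical address (vector << 12), i.e. CS:IP =
 * (vector << 8):0000.  For example, a boot_address of 0x9f000 yields
 * vector 0x9f and an AP entry point of 9f00:0000.
 */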

#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
    0, "");

u_int ipi_masked_global;
u_int ipi_masked_page;
u_int ipi_masked_range;
u_int ipi_masked_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
    &ipi_masked_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
    &ipi_masked_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
    &ipi_masked_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
    &ipi_masked_range_size, 0, "");
#endif /* COUNT_XINVLTLB_HITS */

/*
 * Flush the TLB on all other CPUs.
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;

	ncpu = mp_ncpus - 1;	/* does not shoot down self */
	if (ncpu < 1)
		return;		/* no other cpus */
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

static void
smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1,
    vm_offset_t addr2)
{
	int ncpu, othercpus;

	othercpus = mp_ncpus - 1;
	if (mask == (u_int)-1) {
		ncpu = othercpus;
		if (ncpu < 1)
			return;
	} else {
		mask &= ~PCPU_GET(cpumask);
		if (mask == 0)
			return;
		ncpu = bitcount32(mask);
		if (ncpu > othercpus) {
			/* XXX this should be a panic offence */
			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
			    ncpu, othercpus);
			ncpu = othercpus;
		}
		/* XXX should be a panic, implied by mask == 0 above */
		if (ncpu < 1)
			return;
	}
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (mask == (u_int)-1)
		ipi_all_but_self(vector);
	else
		ipi_selected(mask, vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

/*
 * Send an IPI to the specified CPU, handling the bitmap logic.
 */
static void
ipi_send_cpu(int cpu, u_int ipi)
{
	u_int bitmap, old_pending, new_pending;

	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
		do {
			old_pending = cpu_ipi_pending[cpu];
			new_pending = old_pending | bitmap;
		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
		    old_pending, new_pending));
		if (old_pending)
			return;
	}
	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
}
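
/*
 * A minimal userland sketch of the same "lossless set-bit" pattern used
 * above, with GCC's __sync builtin standing in for atomic_cmpset_int
 * (illustrative only, not compiled here): only the caller that takes the
 * pending word from zero to nonzero sends the actual vector; later
 * callers merely merge their bit, and the handler drains the whole
 * bitmap with a single read-and-clear.
 */
#if 0
static volatile unsigned int pending;	/* stands in for cpu_ipi_pending[cpu] */

/* Returns nonzero if the caller must also send the actual vector. */
static int
set_pending_bit(unsigned int bitmap)
{
	unsigned int old;

	do {
		old = pending;
	} while (!__sync_bool_compare_and_swap(&pending, old, old | bitmap));
	return (old == 0);
}
#endif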

void
smp_cache_flush(void)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
}

void
smp_invltlb(void)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_global++;
#endif
	}
}

void
smp_invlpg(vm_offset_t addr)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_page++;
#endif
	}
}

void
smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
		ipi_range++;
		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
#endif
	}
}

void
smp_masked_invltlb(cpumask_t mask)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_masked_global++;
#endif
	}
}

void
smp_masked_invlpg(cpumask_t mask, vm_offset_t addr)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_masked_page++;
#endif
	}
}

void
smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
		ipi_masked_range++;
		ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
#endif
	}
}

void
ipi_bitmap_handler(struct trapframe frame)
{
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);

	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(curthread);
	}

	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}

	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
		hardclockintr(&frame);

	if (ipi_bitmap & (1 << IPI_STATCLOCK))
		statclockintr(&frame);

	if (ipi_bitmap & (1 << IPI_PROFCLOCK))
		profclockintr(&frame);
}

/*
 * Send an IPI to a set of CPUs.
 */
void
ipi_selected(cpumask_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.  Set the
	 * mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		atomic_set_int(&ipi_nmi_pending, cpus);

	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);
		ipi_send_cpu(cpu, ipi);
	}
}

/*
 * Send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.  Set the
	 * mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		atomic_set_int(&ipi_nmi_pending, 1 << cpu);

	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
	ipi_send_cpu(cpu, ipi);
}

/*
 * Send an IPI to all CPUs EXCEPT myself.
 */
void
ipi_all_but_self(u_int ipi)
{

	if (IPI_IS_BITMAPED(ipi)) {
		ipi_selected(PCPU_GET(other_cpus), ipi);
		return;
	}

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.  Set the
	 * mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus));
	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

int
ipi_nmi_handler(void)
{
	cpumask_t cpumask;

	/*
	 * As long as there is not a simple way to know about an NMI's
	 * source, if the bitmask for the current CPU is present in
	 * the global pending bitword an IPI_STOP_HARD has been issued
	 * and should be handled.
	 */
	cpumask = PCPU_GET(cpumask);
	if ((ipi_nmi_pending & cpumask) == 0)
		return (1);

	atomic_clear_int(&ipi_nmi_pending, cpumask);
	cpustop_handler();
	return (0);
}

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	cpumask_t cpumask;
	u_int cpu;

	cpu = PCPU_GET(cpuid);
	cpumask = PCPU_GET(cpumask);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	atomic_set_int(&stopped_cpus, cpumask);

	/* Wait for restart */
	while (!(started_cpus & cpumask))
		ia32_pause();

	atomic_clear_int(&started_cpus, cpumask);
	atomic_clear_int(&stopped_cpus, cpumask);

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

static int
sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
{
	cpumask_t mask;
	int error;

	mask = hlt_cpus_mask;
	error = sysctl_handle_int(oidp, &mask, 0, req);
	if (error || !req->newptr)
		return (error);

	if (logical_cpus_mask != 0 &&
	    (mask & logical_cpus_mask) == logical_cpus_mask)
		hlt_logical_cpus = 1;
	else
		hlt_logical_cpus = 0;

	if (!hyperthreading_allowed)
		mask |= hyperthreading_cpus_mask;

	if ((mask & all_cpus) == all_cpus)
		mask &= ~(1<<0);
	hlt_cpus_mask = mask;
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
    0, 0, sysctl_hlt_cpus, "IU",
    "Bitmap of CPUs to halt.  101 (binary) will halt CPUs 0 and 2.");

static int
sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
{
	int disable, error;

	disable = hlt_logical_cpus;
	error = sysctl_handle_int(oidp, &disable, 0, req);
	if (error || !req->newptr)
		return (error);

	if (disable)
		hlt_cpus_mask |= logical_cpus_mask;
	else
		hlt_cpus_mask &= ~logical_cpus_mask;

	if (!hyperthreading_allowed)
		hlt_cpus_mask |= hyperthreading_cpus_mask;

	if ((hlt_cpus_mask & all_cpus) == all_cpus)
		hlt_cpus_mask &= ~(1<<0);

	hlt_logical_cpus = disable;
	return (error);
}

static int
sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
{
	int allowed, error;

	allowed = hyperthreading_allowed;
	error = sysctl_handle_int(oidp, &allowed, 0, req);
	if (error || !req->newptr)
		return (error);

#ifdef SCHED_ULE
	/*
	 * SCHED_ULE doesn't allow enabling/disabling HT cores at
	 * run-time.
	 */
	if (allowed != hyperthreading_allowed)
		return (ENOTSUP);
	return (error);
#endif

	if (allowed)
		hlt_cpus_mask &= ~hyperthreading_cpus_mask;
	else
		hlt_cpus_mask |= hyperthreading_cpus_mask;

	if (logical_cpus_mask != 0 &&
	    (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
		hlt_logical_cpus = 1;
	else
		hlt_logical_cpus = 0;

	if ((hlt_cpus_mask & all_cpus) == all_cpus)
		hlt_cpus_mask &= ~(1<<0);

	hyperthreading_allowed = allowed;
	return (error);
}

static void
cpu_hlt_setup(void *dummy __unused)
{

	if (logical_cpus_mask != 0) {
		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
		    &hlt_logical_cpus);
		sysctl_ctx_init(&logical_cpu_clist);
		SYSCTL_ADD_PROC(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
		    sysctl_hlt_logical_cpus, "IU", "");
		SYSCTL_ADD_UINT(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
		    &logical_cpus_mask, 0, "");

		if (hlt_logical_cpus)
			hlt_cpus_mask |= logical_cpus_mask;

		/*
		 * If necessary for security purposes, force
		 * hyperthreading off, regardless of the value
		 * of hlt_logical_cpus.
		 */
		if (hyperthreading_cpus_mask) {
			SYSCTL_ADD_PROC(&logical_cpu_clist,
			    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
			    "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
			    0, 0, sysctl_hyperthreading_allowed, "IU", "");
			if (!hyperthreading_allowed)
				hlt_cpus_mask |= hyperthreading_cpus_mask;
		}
	}
}
SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);

int
mp_grab_cpu_hlt(void)
{
	cpumask_t mask;
#ifdef MP_WATCHDOG
	u_int cpuid;
#endif
	int retval;

	mask = PCPU_GET(cpumask);
#ifdef MP_WATCHDOG
	cpuid = PCPU_GET(cpuid);
	ap_watchdog(cpuid);
#endif

	retval = 0;
	while (mask & hlt_cpus_mask) {
		retval = 1;
		/*
		 * "sti; hlt" halts with interrupts enabled; because hlt
		 * executes in the one-instruction interrupt shadow of sti,
		 * an interrupt cannot slip in between the two and be lost.
		 */
		__asm __volatile("sti; hlt" : : : "memory");
	}
	return (retval);
}

#ifdef COUNT_IPIS
/*
 * Setup interrupt counters for IPI handlers.
 */
static void
mp_ipi_intrcnt(void *dummy)
{
	char buf[64];
	int i;

	for (i = 0; i <= mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		snprintf(buf, sizeof(buf), "cpu%d: invltlb", i);
		intrcnt_add(buf, &ipi_invltlb_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d: invlrng", i);
		intrcnt_add(buf, &ipi_invlrng_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d: invlpg", i);
		intrcnt_add(buf, &ipi_invlpg_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d: preempt", i);
		intrcnt_add(buf, &ipi_preempt_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d: ast", i);
		intrcnt_add(buf, &ipi_ast_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d: rendezvous", i);
		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d: lazypmap", i);
		intrcnt_add(buf, &ipi_lazypmap_counts[i]);
	}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif