/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2003, by Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_mp_watchdog.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include <machine/apicreg.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/tss.h>

#define	WARMBOOT_TARGET		0
#define	WARMBOOT_OFF		(KERNBASE + 0x0467)
#define	WARMBOOT_SEG		(KERNBASE + 0x0469)

#define	CMOS_REG		(0x70)
#define	CMOS_DATA		(0x71)
#define	BIOS_RESET		(0x0f)
#define	BIOS_WARM		(0x0a)
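
/*
 * Warm-boot mechanics: writing BIOS_WARM into the CMOS shutdown status
 * byte (BIOS_RESET, offset 0x0f) asks the BIOS, on the next processor
 * reset, to skip its power-on tests and jump through the real-mode far
 * pointer stored at physical 0x467 (offset) / 0x469 (segment).
 * start_all_aps() points that vector at the AP trampoline below 1MB.
 */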

/* lock region used by kernel profiling */
int	mcount_lock;

int	mp_naps;		/* # of Application Processors */
int	boot_cpu_id = -1;	/* designated BSP */
extern	int nkpt;

extern	struct pcpu __pcpu[];

/*
 * CPU topology map data structures for HTT.
 */
static struct cpu_group mp_groups[MAXCPU];
static struct cpu_top mp_top;

/* AP uses this during bootstrap.  Do not staticize.  */
char *bootSTK;
static int bootAP;

/* Free these after use */
void *bootstacks[MAXCPU];

/* Temporary holder for double fault stack */
char *doublefault_stack;

/* Hotwire a 0->4MB V==P mapping */
extern pt_entry_t *KPTphys;

/* SMP page table page */
extern pt_entry_t *SMPpt;

struct pcb stoppcbs[MAXCPU];

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
volatile int smp_tlb_wait;

extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

#ifdef STOP_NMI
volatile cpumask_t ipi_nmi_pending;

static void	ipi_nmi_selected(u_int32_t cpus);
#endif

/*
 * Local data and functions.
 */

#ifdef STOP_NMI
/*
 * Provide an alternate method of stopping other CPUs.  If another CPU has
 * disabled interrupts the conventional STOP IPI will be blocked.  This
 * NMI-based stop should get through in that case.
 */
static int stop_cpus_with_nmi = 1;
SYSCTL_INT(_debug, OID_AUTO, stop_cpus_with_nmi, CTLTYPE_INT | CTLFLAG_RW,
    &stop_cpus_with_nmi, 0, "");
TUNABLE_INT("debug.stop_cpus_with_nmi", &stop_cpus_with_nmi);
#else
#define	stop_cpus_with_nmi	0
#endif

static u_int logical_cpus;

/* used to hold the APs until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually set up
 * the APs.
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
	int	cpu_disabled:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static u_int boot_address;

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
static int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static int	hlt_logical_cpus;
static u_int	hyperthreading_cpus;
static cpumask_t	hyperthreading_cpus_mask;
static int	hyperthreading_allowed = 1;
static struct	sysctl_ctx_list logical_cpu_clist;
static u_int	bootMP_size;

static void
mem_range_AP_init(void)
{
	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

void
mp_topology(void)
{
	struct cpu_group *group;
	int apic_id;
	int groups;
	int cpu;

	/* Build the smp_topology map. */
	/* Nothing to do if there is no HTT support. */
	if (hyperthreading_cpus <= 1)
		return;
	group = &mp_groups[0];
	groups = 1;
	for (cpu = 0, apic_id = 0; apic_id <= MAX_APIC_ID; apic_id++) {
		if (!cpu_info[apic_id].cpu_present)
			continue;
		/*
		 * If the current group has members and we're not a logical
		 * cpu, create a new group.
		 */
		if (group->cg_count != 0 &&
		    (apic_id % hyperthreading_cpus) == 0) {
			group++;
			groups++;
		}
		group->cg_count++;
		group->cg_mask |= 1 << cpu;
		cpu++;
	}

	mp_top.ct_count = groups;
	mp_top.ct_group = mp_groups;
	smp_topology = &mp_top;
}
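
/*
 * Example: on a system with four packages of two logical CPUs each and
 * densely allocated APIC IDs, the loop above yields four groups of two,
 * since a new group is started whenever the APIC ID is a multiple of
 * hyperthreading_cpus.  Note that this grouping assumes a dense APIC ID
 * layout; sparse layouts would be grouped less accurately.
 */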

/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

	bootMP_size = mptramp_end - mptramp_start;
	boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */
	if (((basemem * 1024) - boot_address) < bootMP_size)
		boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */
	/* 3 levels of page table pages */
	mptramp_pagetables = boot_address - (PAGE_SIZE * 3);

	return mptramp_pagetables;
}
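
/*
 * The resulting low-memory layout, in ascending physical order, is:
 * three page table pages starting at mptramp_pagetables (PML4, PDP, PD),
 * followed by the trampoline code itself at boot_address.  The return
 * value tells the caller how much base memory remains below our claim.
 */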

void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU) {
		mp_ncpus++;
		mp_maxid = mp_ncpus - 1;
	}
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	/*
	 * mp_maxid should be already set by calls to cpu_add().
	 * Just sanity check its value here.
	 */
	if (mp_ncpus == 0)
		KASSERT(mp_maxid == 0,
		    ("%s: mp_ncpus is zero, but mp_maxid is not", __func__));
	else if (mp_ncpus == 1)
		mp_maxid = 0;
	else
		KASSERT(mp_maxid >= mp_ncpus - 1,
		    ("%s: counters out of sync: max %d, count %d", __func__,
		    mp_maxid, mp_ncpus));
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	all_cpus = 1;
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Set up
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		mp_maxid = 0;
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}

/*
 * Initialize the IPI handlers and start up the APs.
 */
void
cpu_mp_start(void)
{
	int i;
	u_int threads_per_cache, p[4];

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;
		cpu_ipi_pending[i] = 0;
	}

	/* Install an inter-CPU IPI for TLB invalidation */
	setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for cache invalidation. */
	setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for all-CPU rendezvous */
	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);

	/* Install generic inter-CPU IPI handler */
	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
	    SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for CPU stop/restart */
	setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));
	cpu_apic_ids[0] = boot_cpu_id;

	assign_cpu_ids();

	/* Start each Application Processor */
	start_all_aps();

	/* Set up the initial logical CPUs info. */
	logical_cpus = logical_cpus_mask = 0;
	if (cpu_feature & CPUID_HTT)
		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;

	/*
	 * Work out if hyperthreading is *really* enabled.  This
	 * is made really ugly by the fact that processors lie: Dual
	 * core processors claim to be hyperthreaded even when they're
	 * not, presumably because they want to be treated the same
	 * way as HTT with respect to per-cpu software licensing.
	 * At the time of writing (May 12, 2005) the only hyperthreaded
	 * cpus are from Intel, and Intel's dual-core processors can be
	 * identified via the "deterministic cache parameters" cpuid
	 * calls.
	 */
	/*
	 * First determine if this is an Intel processor which claims
	 * to have hyperthreading support.
	 */
	if ((cpu_feature & CPUID_HTT) &&
	    (strcmp(cpu_vendor, "GenuineIntel") == 0)) {
		/*
		 * If the "deterministic cache parameters" cpuid calls
		 * are available, use them.
		 */
		if (cpu_high >= 4) {
			/* Ask the processor about the L1 cache. */
			for (i = 0; i < 1; i++) {
				cpuid_count(4, i, p);
				threads_per_cache = ((p[0] & 0x3ffc000) >> 14) + 1;
				if (hyperthreading_cpus < threads_per_cache)
					hyperthreading_cpus = threads_per_cache;
				if ((p[0] & 0x1f) == 0)
					break;
			}
		}
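
		/*
		 * CPUID leaf 4 reports, in EAX, the cache type in bits
		 * 4:0 (0 means no more caches) and the maximum number of
		 * logical processors sharing the cache, minus one, in
		 * bits 25:14; hence the 0x1f and 0x3ffc000 masks above.
		 */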

		/*
		 * If the deterministic cache parameters are not
		 * available, or if no caches were reported to exist,
		 * just accept what the HTT flag indicated.
		 */
		if (hyperthreading_cpus == 0)
			hyperthreading_cpus = logical_cpus;
	}

	set_interrupt_apic_ids();

	/* Last, setup the cpu topology now that we have probed CPUs */
	mp_topology();
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	int i, x;

	/* List CPUs */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
			continue;
		if (cpu_info[x].cpu_disabled)
			printf(" cpu (AP): APIC ID: %2d (disabled)\n", x);
		else {
			KASSERT(i < mp_ncpus,
			    ("mp_ncpus and actual cpus are out of whack"));
			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
		}
	}
}

/*
 * AP CPUs call this to initialize themselves.
 */
void
init_secondary(void)
{
	struct pcpu *pc;
	u_int64_t msr, cr0;
	int cpu, gsel_tss, x;
	struct region_descriptor ap_gdt;

	/* Set by the startup code for us to use */
	cpu = bootAP;

	/* Init tss */
	common_tss[cpu] = common_tss[0];
	common_tss[cpu].tss_rsp0 = 0;	/* not used until after switch */
	common_tss[cpu].tss_iobase = sizeof(struct amd64tss);
	common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];

	/* Prepare private GDT */
	gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]);
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]);
	}
	ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	ap_gdt.rd_base = (long) &gdt[NGDT * cpu];
	lgdt(&ap_gdt);			/* does magic intra-segment return */

	/* Get per-cpu data */
	pc = &__pcpu[cpu];

	/* prime data page for it to use */
	pcpu_init(pc, cpu, sizeof(struct pcpu));
	pc->pc_apic_id = cpu_apic_ids[cpu];
	pc->pc_prvspace = pc;
	pc->pc_curthread = 0;
	pc->pc_tssp = &common_tss[cpu];
	pc->pc_rsp0 = 0;
	pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, (u_int64_t)pc);	/* XXX User value while we're in the kernel */

	lidt(&r_idt);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	cr0 = rcr0();
	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
	load_cr0(cr0);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
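
	/*
	 * MSR_STAR bits 47:32 select the kernel CS (and SS = CS + 8)
	 * loaded by SYSCALL; bits 63:48 are the base selector from which
	 * SYSRET derives the user CS and SS.  MSR_SF_MASK lists rflags
	 * bits cleared on kernel entry; clearing PSL_I ensures we arrive
	 * with interrupts disabled until a kernel stack is established.
	 */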

	/* Disable local APIC just to be sure. */
	lapic_disable();

	/* signal our startup to the BSP. */
	mp_naps++;

	/* Spin until the BSP releases the APs. */
	while (!aps_ready)
		ia32_pause();

	/* Initialize the PAT MSR. */
	pmap_init_pat();

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up SSE/NX registers */
	initializecpu();

	/* set up FPU state on the AP */
	fpuinit();

	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mtx_lock_spin(&ap_boot_mtx);

	/* Init local apic for IRQs */
	lapic_setup(1);

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* Determine if we are a logical CPU. */
	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
		logical_cpus_mask |= PCPU_GET(cpumask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		hyperthreading_cpus_mask |= PCPU_GET(cpumask);

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	if (bootverbose)
		lapic_dump("AP");

	if (smp_cpus == mp_ncpus) {
		/* enable IPIs, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	 /* historic */
	}

	/*
	 * Enable the global pages TLB extension.
	 * This also implicitly flushes the TLB.
	 */
	load_cr4(rcr4() | CR4_PGE);

	mtx_unlock_spin(&ap_boot_mtx);

	/* wait until all the APs are up */
	while (smp_started == 0)
		ia32_pause();

	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code which CPUs are to receive interrupts.  If we
 * don't want certain CPUs to receive IRQs, we simply never register them
 * here.  We also do not register the BSP, since the I/O APIC code adds
 * the BSP itself so that it works with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (hyperthreading_cpus > 1 &&
		    apic_id % hyperthreading_cpus != 0)
			continue;

		intr_add_cpu(i);
	}
}

/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
	 * so we only have to assign IDs for APs.
	 */
	mp_ncpus = 1;
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}

/*
 * start each AP in our list
 */
static int
start_all_aps(void)
{
	vm_offset_t va = boot_address + KERNBASE;
	u_int64_t *pt4, *pt3, *pt2;
	u_int32_t mpbioswarmvec;
	int apic_id, cpu, i;
	u_char mpbiosreason;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* install the AP 1st level boot code */
	pmap_kenter(va, boot_address);
	pmap_invalidate_page(kernel_pmap, va);
	bcopy(mptramp_start, (void *)va, bootMP_size);

	/* Locate the page tables, they'll be below the trampoline */
	pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE);
	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);

	/* Create the initial 1GB replicated page tables */
	for (i = 0; i < 512; i++) {
		/* Each slot of the level 4 pages points to the same level 3 page */
		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
		pt4[i] |= PG_V | PG_RW | PG_U;

		/* Each slot of the level 3 pages points to the same level 2 page */
		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
		pt3[i] |= PG_V | PG_RW | PG_U;

		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
		pt2[i] = i * (2 * 1024 * 1024);
		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
	}
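
	/*
	 * Replicating the same entries in every slot at each level maps
	 * the low 1GB of physical memory at 1GB intervals throughout the
	 * address space.  The trampoline can thus enable paging while
	 * still running at its sub-1MB physical address, and the kernel's
	 * KERNBASE addresses resolve through these same tables until the
	 * AP switches to the real kernel page tables.
	 */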

	/* save the current value of the warm-start vector */
	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
	outb(CMOS_REG, BIOS_RESET);
	mpbiosreason = inb(CMOS_DATA);

	/* setup a vector to our boot code */
	*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
	*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];

		/* allocate and set up an idle stack data page */
		bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
		doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);

		bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
		bootAP = cpu;

		/* attempt to start the Application Processor */
		if (!start_ap(apic_id)) {
			/* restore the warmstart vector */
			*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
			panic("AP #%d (PHY# %d) failed!", cpu, apic_id);
		}

		all_cpus |= (1 << cpu);		/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	/* restore the warmstart vector */
	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;

	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, mpbiosreason);

	/* number of APs actually started */
	return mp_naps;
}

/*
 * This function starts the AP (application processor) identified by the
 * APIC ID 'apic_id'.  It does quite a "song and dance" to accomplish
 * this.  This is necessary because of the nuances of the different
 * hardware we might encounter.  It isn't pretty, but it seems to work.
 */
static int
start_ap(int apic_id)
{
	int vector, ms;
	int cpus;

	/* calculate the vector */
	vector = (boot_address >> 12) & 0xff;
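
	/*
	 * A STARTUP IPI's vector field holds a physical page number: the
	 * AP begins executing in real mode at address vector << 12, which
	 * is why the trampoline must sit page-aligned below 1MB.
	 */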

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	/*
	 * First we do an INIT/RESET IPI.  This INIT IPI might be run,
	 * resetting and running the target CPU.  Or this INIT IPI might
	 * be latched (P5 bug), with the CPU waiting for a STARTUP IPI.
	 * Or this INIT IPI might be ignored.
	 */

	/* do an INIT IPI: assert RESET */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);

	/* wait for pending status end */
	lapic_ipi_wait(-1);

	/* do an INIT IPI: deassert RESET */
	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);

	/* wait for pending status end */
	DELAY(10000);		/* wait ~10 ms */
	lapic_ipi_wait(-1);

	/*
	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched (P5 bug), in which case this first STARTUP would
	 * terminate immediately and the previously started INIT IPI would
	 * continue.  Or the previous INIT IPI has already run, and this
	 * STARTUP IPI will run.  Or the previous INIT IPI was ignored,
	 * and this STARTUP IPI will run.
	 */

	/* do a STARTUP IPI */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200 us */

	/*
	 * Finally we do a second STARTUP IPI: this second STARTUP IPI
	 * should run if the previous STARTUP IPI was cancelled by a
	 * latched INIT IPI.  Otherwise this STARTUP IPI will be ignored,
	 * as only one STARTUP IPI is recognized after a hardware RESET
	 * or INIT IPI.
	 */

	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200 us */

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		if (mp_naps > cpus)
			return 1;	/* return SUCCESS */
		DELAY(1000);
	}
	return 0;		/* return FAILURE */
}

/*
 * Flush the TLB on all other CPUs.
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;

	ncpu = mp_ncpus - 1;	/* does not shoot down self */
	if (ncpu < 1)
		return;		/* no other cpus */
	if (!(read_rflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}
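
/*
 * The responding CPUs perform the invalidation in their IPI handlers
 * (the IDTVEC(invltlb)/invlpg/invlrng entry points installed in
 * cpu_mp_start()) and then atomically increment smp_tlb_wait, so the
 * initiator spins above until all ncpu responders have checked in.
 * smp_ipi_mtx serializes initiators, keeping smp_tlb_addr1/addr2 stable
 * while a shootdown is in flight.
 */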

static void
smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	int ncpu, othercpus;

	othercpus = mp_ncpus - 1;
	if (mask == (u_int)-1) {
		ncpu = othercpus;
		if (ncpu < 1)
			return;
	} else {
		mask &= ~PCPU_GET(cpumask);
		if (mask == 0)
			return;
		ncpu = bitcount32(mask);
		if (ncpu > othercpus) {
			/* XXX this should be a panic offense */
			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
			    ncpu, othercpus);
			ncpu = othercpus;
		}
		/* XXX should be a panic, implied by mask == 0 above */
		if (ncpu < 1)
			return;
	}
	if (!(read_rflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (mask == (u_int)-1)
		ipi_all_but_self(vector);
	else
		ipi_selected(mask, vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

void
smp_cache_flush(void)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
}

void
smp_invltlb(void)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
	}
}

void
smp_invlpg(vm_offset_t addr)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
}

void
smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
	}
}

void
smp_masked_invltlb(u_int mask)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
	}
}

void
smp_masked_invlpg(u_int mask, vm_offset_t addr)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
	}
}

void
smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
	}
}

void
ipi_bitmap_handler(struct trapframe frame)
{
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);

	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
		struct thread *running_thread = curthread;

		thread_lock(running_thread);
		if (running_thread->td_critnest > 1)
			running_thread->td_owepreempt = 1;
		else
			mi_switch(SW_INVOL | SW_PREEMPT, NULL);
		thread_unlock(running_thread);
	}

	/* Nothing to do for AST */
}

/*
 * send an IPI to a set of CPUs.
 */
void
ipi_selected(u_int32_t cpus, u_int ipi)
{
	int cpu;
	u_int bitmap = 0;
	u_int old_pending;
	u_int new_pending;

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
	}
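
	/*
	 * Bitmapped IPIs (e.g. IPI_PREEMPT) share the single hardware
	 * vector IPI_BITMAP_VECTOR; the specific request is recorded in
	 * the target's cpu_ipi_pending word below.  If bits were already
	 * pending, a hardware IPI is already on its way, so sending a
	 * second one is skipped.
	 */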

#ifdef STOP_NMI
	if (ipi == IPI_STOP && stop_cpus_with_nmi) {
		ipi_nmi_selected(cpus);
		return;
	}
#endif
	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI to non-existent CPU %d", cpu));

		if (bitmap) {
			do {
				old_pending = cpu_ipi_pending[cpu];
				new_pending = old_pending | bitmap;
			} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
			    old_pending, new_pending));

			if (old_pending)
				continue;
		}

		lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
	}
}

/*
 * send an IPI containing 'ipi' to all CPUs, including myself
 */
void
ipi_all(u_int ipi)
{

	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
		ipi_selected(all_cpus, ipi);
		return;
	}
	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_ALL);
}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{

	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
		ipi_selected(PCPU_GET(other_cpus), ipi);
		return;
	}
	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

/*
 * send an IPI to myself
 */
void
ipi_self(u_int ipi)
{

	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
		ipi_selected(PCPU_GET(cpumask), ipi);
		return;
	}
	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_SELF);
}

#ifdef STOP_NMI
/*
 * send NMI IPI to selected CPUs
 */

#define	BEFORE_SPIN	1000000

void
ipi_nmi_selected(u_int32_t cpus)
{
	int cpu;
	register_t icrlo;

	icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT
	    | APIC_TRIGMOD_EDGE;

	CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus);

	atomic_set_int(&ipi_nmi_pending, cpus);

	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI NMI to non-existent CPU %d", cpu));

		/* Wait for an earlier IPI to finish. */
		if (!lapic_ipi_wait(BEFORE_SPIN))
			panic("ipi_nmi_selected: previous IPI has not cleared");

		lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]);
	}
}

int
ipi_nmi_handler(void)
{
	int cpumask = PCPU_GET(cpumask);

	if (!(ipi_nmi_pending & cpumask))
		return 1;

	atomic_clear_int(&ipi_nmi_pending, cpumask);
	cpustop_handler();
	return 0;
}

#endif /* STOP_NMI */

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	int cpu = PCPU_GET(cpuid);
	int cpumask = PCPU_GET(cpumask);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	atomic_set_int(&stopped_cpus, cpumask);

	/* Wait for restart */
	while (!(started_cpus & cpumask))
		ia32_pause();

	atomic_clear_int(&started_cpus, cpumask);
	atomic_clear_int(&stopped_cpus, cpumask);

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

static int
sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
{
	u_int mask;
	int error;

	mask = hlt_cpus_mask;
	error = sysctl_handle_int(oidp, &mask, 0, req);
	if (error || !req->newptr)
		return (error);

	if (logical_cpus_mask != 0 &&
	    (mask & logical_cpus_mask) == logical_cpus_mask)
		hlt_logical_cpus = 1;
	else
		hlt_logical_cpus = 0;

	if (! hyperthreading_allowed)
		mask |= hyperthreading_cpus_mask;

	if ((mask & all_cpus) == all_cpus)
		mask &= ~(1<<0);
	hlt_cpus_mask = mask;
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
    0, 0, sysctl_hlt_cpus, "IU",
    "Bitmap of CPUs to halt.  101 (binary) will halt CPUs 0 and 2.");

static int
sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
{
	int disable, error;

	disable = hlt_logical_cpus;
	error = sysctl_handle_int(oidp, &disable, 0, req);
	if (error || !req->newptr)
		return (error);

	if (disable)
		hlt_cpus_mask |= logical_cpus_mask;
	else
		hlt_cpus_mask &= ~logical_cpus_mask;

	if (! hyperthreading_allowed)
		hlt_cpus_mask |= hyperthreading_cpus_mask;

	if ((hlt_cpus_mask & all_cpus) == all_cpus)
		hlt_cpus_mask &= ~(1<<0);

	hlt_logical_cpus = disable;
	return (error);
}

static int
sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
{
	int allowed, error;

	allowed = hyperthreading_allowed;
	error = sysctl_handle_int(oidp, &allowed, 0, req);
	if (error || !req->newptr)
		return (error);

	if (allowed)
		hlt_cpus_mask &= ~hyperthreading_cpus_mask;
	else
		hlt_cpus_mask |= hyperthreading_cpus_mask;

	if (logical_cpus_mask != 0 &&
	    (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
		hlt_logical_cpus = 1;
	else
		hlt_logical_cpus = 0;

	if ((hlt_cpus_mask & all_cpus) == all_cpus)
		hlt_cpus_mask &= ~(1<<0);

	hyperthreading_allowed = allowed;
	return (error);
}

static void
cpu_hlt_setup(void *dummy __unused)
{

	if (logical_cpus_mask != 0) {
		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
		    &hlt_logical_cpus);
		sysctl_ctx_init(&logical_cpu_clist);
		SYSCTL_ADD_PROC(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
		    sysctl_hlt_logical_cpus, "IU", "");
		SYSCTL_ADD_UINT(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
		    &logical_cpus_mask, 0, "");

		if (hlt_logical_cpus)
			hlt_cpus_mask |= logical_cpus_mask;

		/*
		 * If necessary for security purposes, force
		 * hyperthreading off, regardless of the value
		 * of hlt_logical_cpus.
		 */
		if (hyperthreading_cpus_mask) {
			TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
			    &hyperthreading_allowed);
			SYSCTL_ADD_PROC(&logical_cpu_clist,
			    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
			    "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
			    0, 0, sysctl_hyperthreading_allowed, "IU", "");
			if (! hyperthreading_allowed)
				hlt_cpus_mask |= hyperthreading_cpus_mask;
		}
	}
}
SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);

int
mp_grab_cpu_hlt(void)
{
	u_int mask = PCPU_GET(cpumask);
#ifdef MP_WATCHDOG
	u_int cpuid = PCPU_GET(cpuid);
#endif
	int retval;

#ifdef MP_WATCHDOG
	ap_watchdog(cpuid);
#endif

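	/*
	 * "sti; hlt" is the safe idle sequence: sti takes effect only
	 * after the following instruction completes, so an interrupt
	 * cannot slip in between the two and leave us halted with
	 * nothing to wake us; any IPI arriving once hlt executes brings
	 * the CPU back out of the halt state.
	 */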
	retval = mask & hlt_cpus_mask;
	while (mask & hlt_cpus_mask)
		__asm __volatile("sti; hlt" : : : "memory");
	return (retval);
}