1 /*-
2 * Copyright (c) 1996, by Steve Passe
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. The name of the developer may NOT be used to endorse or promote products
11 * derived from this software without specific prior written permission.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD: releng/5.4/sys/i386/i386/mp_machdep.c 146171 2005-05-13 00:12:57Z nectar $");
28
29 #include "opt_apic.h"
30 #include "opt_cpu.h"
31 #include "opt_kstack_pages.h"
32 #include "opt_mp_watchdog.h"
33
34 #if !defined(lint)
35 #if !defined(SMP)
36 #error How did you get here?
37 #endif
38
39 #if defined(I386_CPU) && !defined(COMPILING_LINT)
40 #error SMP not supported with I386_CPU
41 #endif
42 #ifndef DEV_APIC
43 #error The apic device is required for SMP, add "device apic" to your config file.
44 #endif
45 #if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
46 #error SMP not supported with CPU_DISABLE_CMPXCHG
47 #endif
48 #endif /* not lint */
49
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/bus.h>
53 #include <sys/cons.h> /* cngetc() */
54 #ifdef GPROF
55 #include <sys/gmon.h>
56 #endif
57 #include <sys/kernel.h>
58 #include <sys/ktr.h>
59 #include <sys/lock.h>
60 #include <sys/malloc.h>
61 #include <sys/memrange.h>
62 #include <sys/mutex.h>
63 #include <sys/pcpu.h>
64 #include <sys/proc.h>
65 #include <sys/smp.h>
66 #include <sys/sysctl.h>
67
68 #include <vm/vm.h>
69 #include <vm/vm_param.h>
70 #include <vm/pmap.h>
71 #include <vm/vm_kern.h>
72 #include <vm/vm_extern.h>
73
74 #include <machine/apicreg.h>
75 #include <machine/clock.h>
76 #include <machine/md_var.h>
77 #include <machine/mp_watchdog.h>
78 #include <machine/pcb.h>
79 #include <machine/smp.h>
80 #include <machine/smptests.h> /** COUNT_XINVLTLB_HITS */
81 #include <machine/specialreg.h>
82 #include <machine/privatespace.h>
83
/*
 * BIOS warm-boot vector in the BIOS data area: real-mode IP at 0x467
 * and CS at 0x469.  The BIOS jumps through this after a "warm start"
 * reset, which is how the AP trampoline gains control.
 */
#define WARMBOOT_TARGET 0
#define WARMBOOT_OFF (KERNBASE + 0x0467)
#define WARMBOOT_SEG (KERNBASE + 0x0469)

/* CMOS RTC index/data I/O ports and the shutdown-status byte used to
 * request a warm start on the next reset. */
#define CMOS_REG (0x70)
#define CMOS_DATA (0x71)
#define BIOS_RESET (0x0f)
#define BIOS_WARM (0x0a)

/*
 * this code MUST be enabled here and in mpboot.s.
 * it follows the very early stages of AP boot by placing values in CMOS ram.
 * it NORMALLY will never be needed and thus the primitive method for enabling.
 *
#define CHECK_POINTS
 */

#if defined(CHECK_POINTS) && !defined(PC98)
/* Read/write AP-boot progress bytes in CMOS RAM (locations 0x34-0x39). */
#define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA))
#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))

#define CHECK_INIT(D); \
 CHECK_WRITE(0x34, (D)); \
 CHECK_WRITE(0x35, (D)); \
 CHECK_WRITE(0x36, (D)); \
 CHECK_WRITE(0x37, (D)); \
 CHECK_WRITE(0x38, (D)); \
 CHECK_WRITE(0x39, (D));

#define CHECK_PRINT(S); \
 printf("%s: %d, %d, %d, %d, %d, %d\n", \
    (S), \
    CHECK_READ(0x34), \
    CHECK_READ(0x35), \
    CHECK_READ(0x36), \
    CHECK_READ(0x37), \
    CHECK_READ(0x38), \
    CHECK_READ(0x39));

#else /* CHECK_POINTS */

/* Checkpointing disabled: all three macros compile to nothing. */
#define CHECK_INIT(D)
#define CHECK_PRINT(S)
#define CHECK_WRITE(A, D)

#endif /* CHECK_POINTS */

/*
 * Values to send to the POST hardware.
 */
#define MP_BOOTADDRESS_POST 0x10
#define MP_PROBE_POST 0x11
#define MPTABLE_PASS1_POST 0x12

#define MP_START_POST 0x13
#define MP_ENABLE_POST 0x14
#define MPTABLE_PASS2_POST 0x15

#define START_ALL_APS_POST 0x16
#define INSTALL_AP_TRAMP_POST 0x17
#define START_AP_POST 0x18

#define MP_ANNOUNCE_POST 0x19
147
/* lock region used by kernel profiling */
int mcount_lock;

/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */
int current_postcode;

int mp_naps; /* # of Application processors started */
int boot_cpu_id = -1; /* designated BSP */
extern int nkpt;

/*
 * CPU topology map datastructures for HTT.
 */
static struct cpu_group mp_groups[MAXCPU];
static struct cpu_top mp_top;

/* AP uses this during bootstrap. Do not staticize. */
char *bootSTK;
static int bootAP;	/* logical CPU id of the AP currently being booted */

/* Hotwire a 0->4MB V==P mapping */
extern pt_entry_t *KPTphys;

/* SMP page table page */
extern pt_entry_t *SMPpt;

/* Per-CPU saved contexts; filled in by the cpustop IPI handler. */
struct pcb stoppcbs[MAXCPU];

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
volatile int smp_tlb_wait;

#ifdef KDB_STOP_NMI
volatile cpumask_t ipi_nmi_pending;
#endif

/*
 * Local data and functions.
 */

/* Logical CPUs per package as reported by CPUID; 0 when HTT is absent. */
static u_int logical_cpus;

/* used to hold the AP's until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info {
	int cpu_present:1;
	int cpu_bsp:1;
	int cpu_disabled:1;	/* disabled via "lapic" resource tunable */
} static cpu_info[MAXCPU];
static int cpu_apic_ids[MAXCPU];	/* logical CPU id -> APIC id, -1 if unused */

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

/* Base-memory physical address of the AP boot trampoline. */
static u_int boot_address;

static void set_logical_apic_ids(void);
static int start_all_aps(void);
static void install_ap_tramp(void);
static int start_ap(int apic_id);
static void release_aps(void *dummy);

static int hlt_logical_cpus;
static u_int hyperthreading_cpus;	/* threads sharing a cache, per CPUID leaf 4 */
static cpumask_t hyperthreading_cpus_mask;
static int hyperthreading_allowed;
static struct sysctl_ctx_list logical_cpu_clist;
224
/*
 * Propagate the BSP's memory-range (MTRR) settings to the calling AP,
 * if a memory-range driver is attached and provides an initAP hook.
 */
static void
mem_range_AP_init(void)
{
	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}
231
/*
 * Build the scheduler's smp_topology map for HTT systems: each physical
 * package's logical CPUs are collected into one cpu_group.  Leaves
 * smp_topology untouched (flat topology) when HTT is not present or only
 * one logical CPU per package is reported.
 */
void
mp_topology(void)
{
	struct cpu_group *group;
	int logical_cpus;	/* NB: shadows the file-scope variable */
	int apic_id;
	int groups;
	int cpu;

	/* Build the smp_topology map. */
	/* Nothing to do if there is no HTT support. */
	if ((cpu_feature & CPUID_HTT) == 0)
		return;
	/* Logical CPUs per package from CPUID leaf 1, EBX[23:16]. */
	logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
	if (logical_cpus <= 1)
		return;
	group = &mp_groups[0];
	groups = 1;
	for (cpu = 0, apic_id = 0; apic_id < MAXCPU; apic_id++) {
		if (!cpu_info[apic_id].cpu_present)
			continue;
		/*
		 * If the current group has members and we're not a logical
		 * cpu, create a new group.
		 */
		if (group->cg_count != 0 && (apic_id % logical_cpus) == 0) {
			group++;
			groups++;
		}
		group->cg_count++;
		group->cg_mask |= 1 << cpu;	/* cpu is the dense logical id */
		cpu++;
	}

	mp_top.ct_count = groups;
	mp_top.ct_group = mp_groups;
	smp_topology = &mp_top;
}
270
271
272 /*
273 * Calculate usable address in base memory for AP trampoline code.
274 */
275 u_int
276 mp_bootaddress(u_int basemem)
277 {
278 POSTCODE(MP_BOOTADDRESS_POST);
279
280 boot_address = trunc_page(basemem); /* round down to 4k boundary */
281 if ((basemem - boot_address) < bootMP_size)
282 boot_address -= PAGE_SIZE; /* not enough, lower by 4k */
283
284 return boot_address;
285 }
286
/*
 * Record a CPU discovered by the MP table / MADT parser.  Called once
 * per CPU early in boot; actual AP startup happens later from
 * cpu_mp_start().  'boot_cpu' is non-zero when this CPU is the BSP.
 */
void
cpu_add(u_int apic_id, char boot_cpu)
{

	/* APIC ids index cpu_info[] directly, so they must fit. */
	if (apic_id >= MAXCPU) {
		printf("SMP: CPU %d exceeds maximum CPU %d, ignoring\n",
		    apic_id, MAXCPU - 1);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		/* Only one CPU may claim to be the BSP. */
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	mp_ncpus++;
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");

}
312
/*
 * Set mp_maxid.  i386 uses a dense CPU id namespace, so the largest
 * possible CPU id is simply MAXCPU - 1.
 */
void
cpu_mp_setmaxid(void)
{

	mp_maxid = MAXCPU - 1;
}
319
/*
 * Report whether this is a multiprocessor system.  Returns non-zero iff
 * more than one CPU was enumerated by cpu_add().  As a side effect,
 * seeds all_cpus with the BSP and normalizes mp_ncpus for UP systems.
 */
int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	all_cpus = 1;
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system. Setup
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}
351
352 /*
353 * Initialize the IPI handlers and start up the AP's.
354 */
355 void
356 cpu_mp_start(void)
357 {
358 int i;
359 u_int threads_per_cache, p[4];
360
361 POSTCODE(MP_START_POST);
362
363 /* Initialize the logical ID to APIC ID table. */
364 for (i = 0; i < MAXCPU; i++) {
365 cpu_apic_ids[i] = -1;
366 cpu_ipi_pending[i] = 0;
367 }
368
369 /* Install an inter-CPU IPI for TLB invalidation */
370 setidt(IPI_INVLTLB, IDTVEC(invltlb),
371 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
372 setidt(IPI_INVLPG, IDTVEC(invlpg),
373 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
374 setidt(IPI_INVLRNG, IDTVEC(invlrng),
375 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
376
377 /* Install an inter-CPU IPI for lazy pmap release */
378 setidt(IPI_LAZYPMAP, IDTVEC(lazypmap),
379 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
380
381 /* Install an inter-CPU IPI for all-CPU rendezvous */
382 setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous),
383 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
384
385 /* Install generic inter-CPU IPI handler */
386 setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
387 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
388
389 /* Install an inter-CPU IPI for CPU stop/restart */
390 setidt(IPI_STOP, IDTVEC(cpustop),
391 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
392
393
394 /* Set boot_cpu_id if needed. */
395 if (boot_cpu_id == -1) {
396 boot_cpu_id = PCPU_GET(apic_id);
397 cpu_info[boot_cpu_id].cpu_bsp = 1;
398 } else
399 KASSERT(boot_cpu_id == PCPU_GET(apic_id),
400 ("BSP's APIC ID doesn't match boot_cpu_id"));
401 cpu_apic_ids[0] = boot_cpu_id;
402
403 /* Start each Application Processor */
404 start_all_aps();
405
406 /* Setup the initial logical CPUs info. */
407 logical_cpus = logical_cpus_mask = 0;
408 if (cpu_feature & CPUID_HTT)
409 logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
410
411 /*
412 * Work out if hyperthreading is *really* enabled. This
413 * is made really ugly by the fact that processors lie: Dual
414 * core processors claim to be hyperthreaded even when they're
415 * not, presumably because they want to be treated the same
416 * way as HTT with respect to per-cpu software licensing.
417 * At the time of writing (May 12, 2005) the only hyperthreaded
418 * cpus are from Intel, and Intel's dual-core processors can be
419 * identified via the "deterministic cache parameters" cpuid
420 * calls.
421 */
422 /*
423 * First determine if this is an Intel processor which claims
424 * to have hyperthreading support.
425 */
426 if ((cpu_feature & CPUID_HTT) &&
427 (strcmp(cpu_vendor, "GenuineIntel") == 0)) {
428 /*
429 * If the "deterministic cache parameters" cpuid calls
430 * are available, use them.
431 */
432 if (cpu_high >= 4) {
433 /* Ask the processor about up to 32 caches. */
434 for (i = 0; i < 32; i++) {
435 cpuid_count(4, i, p);
436 threads_per_cache = ((p[0] & 0x3ffc000) >> 14) + 1;
437 if (hyperthreading_cpus < threads_per_cache)
438 hyperthreading_cpus = threads_per_cache;
439 if ((p[0] & 0x1f) == 0)
440 break;
441 }
442 }
443
444 /*
445 * If the deterministic cache parameters are not
446 * available, or if no caches were reported to exist,
447 * just accept what the HTT flag indicated.
448 */
449 if (hyperthreading_cpus == 0)
450 hyperthreading_cpus = logical_cpus;
451 }
452
453 set_logical_apic_ids();
454 }
455
456
457 /*
458 * Print various information about the SMP system hardware and setup.
459 */
460 void
461 cpu_mp_announce(void)
462 {
463 int i, x;
464
465 POSTCODE(MP_ANNOUNCE_POST);
466
467 /* List CPUs */
468 printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
469 for (i = 1, x = 0; x < MAXCPU; x++) {
470 if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
471 continue;
472 if (cpu_info[x].cpu_disabled)
473 printf(" cpu (AP): APIC ID: %2d (disabled)\n", x);
474 else {
475 KASSERT(i < mp_ncpus,
476 ("mp_ncpus and actual cpus are out of whack"));
477 printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
478 }
479 }
480 }
481
482 /*
483 * AP CPU's call this to initialize themselves.
484 */
485 void
486 init_secondary(void)
487 {
488 int gsel_tss;
489 int x, myid;
490 u_int cr0;
491
492 /* bootAP is set in start_ap() to our ID. */
493 myid = bootAP;
494 gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid];
495 gdt_segs[GPROC0_SEL].ssd_base =
496 (int) &SMP_prvspace[myid].pcpu.pc_common_tss;
497 SMP_prvspace[myid].pcpu.pc_prvspace =
498 &SMP_prvspace[myid].pcpu;
499
500 for (x = 0; x < NGDT; x++) {
501 ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
502 }
503
504 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
505 r_gdt.rd_base = (int) &gdt[myid * NGDT];
506 lgdt(&r_gdt); /* does magic intra-segment return */
507
508 lidt(&r_idt);
509
510 lldt(_default_ldt);
511 PCPU_SET(currentldt, _default_ldt);
512
513 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
514 gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
515 PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
516 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
517 PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
518 PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
519 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
520 ltr(gsel_tss);
521
522 /*
523 * Set to a known state:
524 * Set by mpboot.s: CR0_PG, CR0_PE
525 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
526 */
527 cr0 = rcr0();
528 cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
529 load_cr0(cr0);
530 CHECK_WRITE(0x38, 5);
531
532 /* Disable local APIC just to be sure. */
533 lapic_disable();
534
535 /* signal our startup to the BSP. */
536 mp_naps++;
537 CHECK_WRITE(0x39, 6);
538
539 /* Spin until the BSP releases the AP's. */
540 while (!aps_ready)
541 ia32_pause();
542
543 /* BSP may have changed PTD while we were waiting */
544 invltlb();
545 pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);
546
547 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
548 lidt(&r_idt);
549 #endif
550
551 /* set up CPU registers and state */
552 cpu_setregs();
553
554 /* set up FPU state on the AP */
555 npxinit(__INITIAL_NPXCW__);
556
557 /* set up SSE registers */
558 enable_sse();
559
560 /* A quick check from sanity claus */
561 if (PCPU_GET(apic_id) != lapic_id()) {
562 printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
563 printf("SMP: actual apic_id = %d\n", lapic_id());
564 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
565 printf("PTD[MPPTDI] = %#jx\n", (uintmax_t)PTD[MPPTDI]);
566 panic("cpuid mismatch! boom!!");
567 }
568
569 mtx_lock_spin(&ap_boot_mtx);
570
571 /* Init local apic for irq's */
572 lapic_setup();
573
574 /* Set memory range attributes for this CPU to match the BSP */
575 mem_range_AP_init();
576
577 smp_cpus++;
578
579 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
580 printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
581
582 /* Determine if we are a logical CPU. */
583 if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
584 logical_cpus_mask |= PCPU_GET(cpumask);
585
586 /* Determine if we are a hyperthread. */
587 if (hyperthreading_cpus > 1 &&
588 PCPU_GET(apic_id) % hyperthreading_cpus != 0)
589 hyperthreading_cpus_mask |= PCPU_GET(cpumask);
590
591 /* Build our map of 'other' CPUs. */
592 PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
593
594 if (bootverbose)
595 lapic_dump("AP");
596
597 if (smp_cpus == mp_ncpus) {
598 /* enable IPI's, tlb shootdown, freezes etc */
599 atomic_store_rel_int(&smp_started, 1);
600 smp_active = 1; /* historic */
601 }
602
603 mtx_unlock_spin(&ap_boot_mtx);
604
605 /* wait until all the AP's are up */
606 while (smp_started == 0)
607 ia32_pause();
608
609 /* ok, now grab sched_lock and enter the scheduler */
610 mtx_lock_spin(&sched_lock);
611
612 binuptime(PCPU_PTR(switchtime));
613 PCPU_SET(switchticks, ticks);
614
615 cpu_throw(NULL, choosethread()); /* doesn't return */
616
617 panic("scheduler returned us to %s", __func__);
618 /* NOTREACHED */
619 }
620
621 /*******************************************************************
622 * local functions and data
623 */
624
625 /*
626 * Set the APIC logical IDs.
627 *
628 * We want to cluster logical CPU's within the same APIC ID cluster.
629 * Since logical CPU's are aligned simply filling in the clusters in
630 * APIC ID order works fine. Note that this does not try to balance
631 * the number of CPU's in each cluster. (XXX?)
632 */
633 static void
634 set_logical_apic_ids(void)
635 {
636 u_int apic_id, cluster, cluster_id;
637
638 /* Force us to allocate cluster 0 at the start. */
639 cluster = -1;
640 cluster_id = APIC_MAX_INTRACLUSTER_ID;
641 for (apic_id = 0; apic_id < MAXCPU; apic_id++) {
642 if (!cpu_info[apic_id].cpu_present)
643 continue;
644 if (cluster_id == APIC_MAX_INTRACLUSTER_ID) {
645 cluster = ioapic_next_logical_cluster();
646 cluster_id = 0;
647 } else
648 cluster_id++;
649 if (bootverbose)
650 printf("APIC ID: physical %u, logical %u:%u\n",
651 apic_id, cluster, cluster_id);
652 lapic_set_logical_id(apic_id, cluster, cluster_id);
653 }
654 }
655
656 /*
657 * start each AP in our list
658 */
659 static int
660 start_all_aps(void)
661 {
662 #ifndef PC98
663 u_char mpbiosreason;
664 #endif
665 u_long mpbioswarmvec;
666 struct pcpu *pc;
667 char *stack;
668 uintptr_t kptbase;
669 int i, pg, apic_id, cpu;
670
671 POSTCODE(START_ALL_APS_POST);
672
673 mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
674
675 /* install the AP 1st level boot code */
676 install_ap_tramp();
677
678 /* save the current value of the warm-start vector */
679 mpbioswarmvec = *((u_long *) WARMBOOT_OFF);
680 #ifndef PC98
681 outb(CMOS_REG, BIOS_RESET);
682 mpbiosreason = inb(CMOS_DATA);
683 #endif
684
685 /* set up temporary P==V mapping for AP boot */
686 /* XXX this is a hack, we should boot the AP on its own stack/PTD */
687 kptbase = (uintptr_t)(void *)KPTphys;
688 for (i = 0; i < NKPT; i++)
689 PTD[i] = (pd_entry_t)(PG_V | PG_RW |
690 ((kptbase + i * PAGE_SIZE) & PG_FRAME));
691 invltlb();
692
693 /* start each AP */
694 for (cpu = 0, apic_id = 0; apic_id < MAXCPU; apic_id++) {
695
696 /* Ignore non-existent CPUs and the BSP. */
697 if (!cpu_info[apic_id].cpu_present ||
698 cpu_info[apic_id].cpu_bsp)
699 continue;
700
701 /* Don't use this CPU if it has been disabled by a tunable. */
702 if (resource_disabled("lapic", apic_id)) {
703 cpu_info[apic_id].cpu_disabled = 1;
704 mp_ncpus--;
705 continue;
706 }
707
708 cpu++;
709
710 /* save APIC ID for this logical ID */
711 cpu_apic_ids[cpu] = apic_id;
712
713 /* first page of AP's private space */
714 pg = cpu * i386_btop(sizeof(struct privatespace));
715
716 /* allocate a new private data page */
717 pc = (struct pcpu *)kmem_alloc(kernel_map, PAGE_SIZE);
718
719 /* wire it into the private page table page */
720 SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(pc));
721
722 /* allocate and set up an idle stack data page */
723 stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); /* XXXKSE */
724 for (i = 0; i < KSTACK_PAGES; i++)
725 SMPpt[pg + 1 + i] = (pt_entry_t)
726 (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
727
728 /* prime data page for it to use */
729 pcpu_init(pc, cpu, sizeof(struct pcpu));
730 pc->pc_apic_id = apic_id;
731
732 /* setup a vector to our boot code */
733 *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
734 *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
735 #ifndef PC98
736 outb(CMOS_REG, BIOS_RESET);
737 outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */
738 #endif
739
740 bootSTK = &SMP_prvspace[cpu].idlekstack[KSTACK_PAGES *
741 PAGE_SIZE];
742 bootAP = cpu;
743
744 /* attempt to start the Application Processor */
745 CHECK_INIT(99); /* setup checkpoints */
746 if (!start_ap(apic_id)) {
747 printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
748 CHECK_PRINT("trace"); /* show checkpoints */
749 /* better panic as the AP may be running loose */
750 printf("panic y/n? [y] ");
751 if (cngetc() != 'n')
752 panic("bye-bye");
753 }
754 CHECK_PRINT("trace"); /* show checkpoints */
755
756 all_cpus |= (1 << cpu); /* record AP in CPU map */
757 }
758
759 /* build our map of 'other' CPUs */
760 PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
761
762 /* restore the warmstart vector */
763 *(u_long *) WARMBOOT_OFF = mpbioswarmvec;
764 #ifndef PC98
765 outb(CMOS_REG, BIOS_RESET);
766 outb(CMOS_DATA, mpbiosreason);
767 #endif
768
769 /*
770 * Set up the idle context for the BSP. Similar to above except
771 * that some was done by locore, some by pmap.c and some is implicit
772 * because the BSP is cpu#0 and the page is initially zero and also
773 * because we can refer to variables by name on the BSP..
774 */
775
776 /* Allocate and setup BSP idle stack */
777 stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
778 for (i = 0; i < KSTACK_PAGES; i++)
779 SMPpt[1 + i] = (pt_entry_t)
780 (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
781
782 for (i = 0; i < NKPT; i++)
783 PTD[i] = 0;
784 pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);
785
786 /* number of APs actually started */
787 return mp_naps;
788 }
789
790 /*
791 * load the 1st level AP boot code into base memory.
792 */
793
794 /* targets for relocation */
795 extern void bigJump(void);
796 extern void bootCodeSeg(void);
797 extern void bootDataSeg(void);
798 extern void MPentry(void);
799 extern u_int MP_GDT;
800 extern u_int mp_gdtbase;
801
/*
 * Copy the real-mode AP trampoline (bootMP from mpboot.s) into base
 * memory at boot_address, then patch its embedded addresses (GDT base,
 * protected-mode jump target, and code/data segment bases) to match
 * where it was actually placed.  Changes to mpboot.s may require
 * matching changes here.
 */
static void
install_ap_tramp(void)
{
	int x;
	int size = *(int *) ((u_long) & bootMP_size);
	vm_offset_t va = boot_address + KERNBASE;
	u_char *src = (u_char *) ((u_long) bootMP);
	u_char *dst = (u_char *) va;
	u_int boot_base = (u_int) bootMP;
	u_int8_t *dst8;
	u_int16_t *dst16;
	u_int32_t *dst32;

	POSTCODE(INSTALL_AP_TRAMP_POST);

	/* Only one page is mapped below, so the code must fit in it. */
	KASSERT (size <= PAGE_SIZE,
	    ("'size' do not fit into PAGE_SIZE, as expected."));
	pmap_kenter(va, boot_address);
	pmap_invalidate_page (kernel_pmap, va);
	for (x = 0; x < size; ++x)
		*dst++ = *src++;

	/*
	 * modify addresses in code we just moved to basemem. unfortunately we
	 * need fairly detailed info about mpboot.s for this to work.  changes
	 * to mpboot.s might require changes here.
	 */

	/* boot code is located in KERNEL space */
	dst = (u_char *) va;

	/* modify the lgdt arg */
	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
	*dst32 = boot_address + ((u_int) & MP_GDT - boot_base);

	/* modify the ljmp target for MPentry() (+1 skips the opcode byte) */
	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
	*dst32 = ((u_int) MPentry - KERNBASE);

	/* modify the target for boot code segment (16-bit base + high byte) */
	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
	dst8 = (u_int8_t *) (dst16 + 1);
	*dst16 = (u_int) boot_address & 0xffff;
	*dst8 = ((u_int) boot_address >> 16) & 0xff;

	/* modify the target for boot data segment */
	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
	dst8 = (u_int8_t *) (dst16 + 1);
	*dst16 = (u_int) boot_address & 0xffff;
	*dst8 = ((u_int) boot_address >> 16) & 0xff;
}
853
854 /*
855 * This function starts the AP (application processor) identified
856 * by the APIC ID 'physicalCpu'. It does quite a "song and dance"
857 * to accomplish this. This is necessary because of the nuances
858 * of the different hardware we might encounter. It isn't pretty,
859 * but it seems to work.
860 */
861 static int
862 start_ap(int apic_id)
863 {
864 int vector, ms;
865 int cpus;
866
867 POSTCODE(START_AP_POST);
868
869 /* calculate the vector */
870 vector = (boot_address >> 12) & 0xff;
871
872 /* used as a watchpoint to signal AP startup */
873 cpus = mp_naps;
874
875 /*
876 * first we do an INIT/RESET IPI this INIT IPI might be run, reseting
877 * and running the target CPU. OR this INIT IPI might be latched (P5
878 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
879 * ignored.
880 */
881
882 /* do an INIT IPI: assert RESET */
883 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
884 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
885
886 /* wait for pending status end */
887 lapic_ipi_wait(-1);
888
889 /* do an INIT IPI: deassert RESET */
890 lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
891 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);
892
893 /* wait for pending status end */
894 DELAY(10000); /* wait ~10mS */
895 lapic_ipi_wait(-1);
896
897 /*
898 * next we do a STARTUP IPI: the previous INIT IPI might still be
899 * latched, (P5 bug) this 1st STARTUP would then terminate
900 * immediately, and the previously started INIT IPI would continue. OR
901 * the previous INIT IPI has already run. and this STARTUP IPI will
902 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
903 * will run.
904 */
905
906 /* do a STARTUP IPI */
907 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
908 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
909 vector, apic_id);
910 lapic_ipi_wait(-1);
911 DELAY(200); /* wait ~200uS */
912
913 /*
914 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
915 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
916 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
917 * recognized after hardware RESET or INIT IPI.
918 */
919
920 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
921 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
922 vector, apic_id);
923 lapic_ipi_wait(-1);
924 DELAY(200); /* wait ~200uS */
925
926 /* Wait up to 5 seconds for it to start. */
927 for (ms = 0; ms < 5000; ms++) {
928 if (mp_naps > cpus)
929 return 1; /* return SUCCESS */
930 DELAY(1000);
931 }
932 return 0; /* return FAILURE */
933 }
934
#ifdef COUNT_XINVLTLB_HITS
/*
 * Debug counters for TLB-shootdown IPIs, exported under debug.xhits.
 * The xhits_* arrays count handler invocations per CPU; the ipi_*
 * scalars count shootdowns requested, by type.
 */
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
    0, "");

u_int ipi_masked_global;
u_int ipi_masked_page;
u_int ipi_masked_range;
u_int ipi_masked_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
    &ipi_masked_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
    &ipi_masked_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
    &ipi_masked_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
    &ipi_masked_range_size, 0, "");
#endif /* COUNT_XINVLTLB_HITS */
970
971 /*
972 * Flush the TLB on all other CPU's
973 */
974 static void
975 smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
976 {
977 u_int ncpu;
978
979 ncpu = mp_ncpus - 1; /* does not shootdown self */
980 if (ncpu < 1)
981 return; /* no other cpus */
982 mtx_assert(&smp_ipi_mtx, MA_OWNED);
983 smp_tlb_addr1 = addr1;
984 smp_tlb_addr2 = addr2;
985 atomic_store_rel_int(&smp_tlb_wait, 0);
986 ipi_all_but_self(vector);
987 /*
988 * Enable interrupts here to workaround Opteron Errata 106.
989 * The while loop runs entirely out of instruction cache,
990 * which blocks updates to the cache from other CPUs.
991 * Interrupts break the lock, allowing the write to post.
992 */
993 enable_intr();
994 while (smp_tlb_wait < ncpu)
995 ia32_pause();
996 disable_intr();
997 }
998
999 /*
1000 * This is about as magic as it gets. fortune(1) has got similar code
1001 * for reversing bits in a word. Who thinks up this stuff??
1002 *
1003 * Yes, it does appear to be consistently faster than:
1004 * while (i = ffs(m)) {
1005 * m >>= i;
1006 * bits++;
1007 * }
1008 * and
1009 * while (lsb = (m & -m)) { // This is magic too
1010 * m &= ~lsb; // or: m ^= lsb
1011 * bits++;
1012 * }
1013 * Both of these latter forms do some very strange things on gcc-3.1 with
1014 * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
1015 * There is probably an SSE or MMX popcnt instruction.
1016 *
1017 * I wonder if this should be in libkern?
1018 *
1019 * XXX Stop the presses! Another one:
1020 * static __inline u_int32_t
1021 * popcnt1(u_int32_t v)
1022 * {
1023 * v -= ((v >> 1) & 0x55555555);
1024 * v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
1025 * v = (v + (v >> 4)) & 0x0F0F0F0F;
1026 * return (v * 0x01010101) >> 24;
1027 * }
1028 * The downside is that it has a multiply. With a pentium3 with
1029 * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
1030 * an imull, and in that case it is faster. In most other cases
1031 * it appears slightly slower.
1032 *
1033 * Another variant (also from fortune):
1034 * #define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255)
1035 * #define BX_(x) ((x) - (((x)>>1)&0x77777777) \
1036 * - (((x)>>2)&0x33333333) \
1037 * - (((x)>>3)&0x11111111))
1038 */
/*
 * Population count: number of set bits in a 32-bit word, using the
 * classic parallel ("SWAR") reduction.  Each step sums adjacent bit
 * fields of twice the previous width; after five steps the whole word
 * holds the total.
 */
static __inline u_int32_t
popcnt(u_int32_t v)
{

	v = (v & 0x55555555) + ((v >> 1) & 0x55555555);
	v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
	v = (v & 0x0f0f0f0f) + ((v >> 4) & 0x0f0f0f0f);
	v = (v & 0x00ff00ff) + ((v >> 8) & 0x00ff00ff);
	v = (v & 0x0000ffff) + ((v >> 16) & 0x0000ffff);
	return (v);
}
1050
/*
 * Like smp_tlb_shootdown(), but targets only the CPUs in 'mask'
 * (excluding self); a mask of all ones broadcasts to every other CPU.
 * Caller must hold smp_ipi_mtx.
 */
static void
smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	int ncpu, othercpus;

	othercpus = mp_ncpus - 1;
	if (mask == (u_int)-1) {
		/* Broadcast case: everyone but self. */
		ncpu = othercpus;
		if (ncpu < 1)
			return;
	} else {
		mask &= ~PCPU_GET(cpumask);
		if (mask == 0)
			return;
		/* Number of acknowledgements to wait for. */
		ncpu = popcnt(mask);
		if (ncpu > othercpus) {
			/* XXX this should be a panic offence */
			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
			    ncpu, othercpus);
			ncpu = othercpus;
		}
		/* XXX should be a panic, implied by mask == 0 above */
		if (ncpu < 1)
			return;
	}
	mtx_assert(&smp_ipi_mtx, MA_OWNED);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (mask == (u_int)-1)
		ipi_all_but_self(vector);
	else
		ipi_selected(mask, vector);
	/*
	 * Enable interrupts here to workaround Opteron Errata 106.
	 * The while loop runs entirely out of instruction cache,
	 * which blocks updates to the cache from other CPUs.
	 * Interrupts break the lock, allowing the write to post.
	 */
	enable_intr();
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	disable_intr();
}
1095
1096 void
1097 smp_invltlb(void)
1098 {
1099 if (smp_started) {
1100 smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
1101 #ifdef COUNT_XINVLTLB_HITS
1102 ipi_global++;
1103 #endif
1104 }
1105 }
1106
1107 void
1108 smp_invlpg(vm_offset_t addr)
1109 {
1110 if (smp_started) {
1111 smp_tlb_shootdown(IPI_INVLPG, addr, 0);
1112 #ifdef COUNT_XINVLTLB_HITS
1113 ipi_page++;
1114 #endif
1115 }
1116 }
1117
1118 void
1119 smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
1120 {
1121 if (smp_started) {
1122 smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
1123 #ifdef COUNT_XINVLTLB_HITS
1124 ipi_range++;
1125 ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
1126 #endif
1127 }
1128 }
1129
1130 void
1131 smp_masked_invltlb(u_int mask)
1132 {
1133 if (smp_started) {
1134 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
1135 #ifdef COUNT_XINVLTLB_HITS
1136 ipi_masked_global++;
1137 #endif
1138 }
1139 }
1140
1141 void
1142 smp_masked_invlpg(u_int mask, vm_offset_t addr)
1143 {
1144 if (smp_started) {
1145 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
1146 #ifdef COUNT_XINVLTLB_HITS
1147 ipi_masked_page++;
1148 #endif
1149 }
1150 }
1151
1152 void
1153 smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
1154 {
1155 if (smp_started) {
1156 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
1157 #ifdef COUNT_XINVLTLB_HITS
1158 ipi_masked_range++;
1159 ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
1160 #endif
1161 }
1162 }
1163
1164
1165 /*
1166 * For statclock, we send an IPI to all CPU's to have them call this
1167 * function.
1168 */
1169
1170 void
1171 forward_statclock(void)
1172 {
1173 int map;
1174
1175 CTR0(KTR_SMP, "forward_statclock");
1176
1177 if (!smp_started || cold || panicstr)
1178 return;
1179
1180 map = PCPU_GET(other_cpus) & ~(stopped_cpus|hlt_cpus_mask);
1181 if (map != 0)
1182 ipi_selected(map, IPI_STATCLOCK);
1183 }
1184
1185 /*
1186 * For each hardclock(), we send an IPI to all other CPU's to have them
1187 * execute this function. It would be nice to reduce contention on
1188 * sched_lock if we could simply peek at the CPU to determine the user/kernel
1189 * state and call hardclock_process() on the CPU receiving the clock interrupt
1190 * and then just use a simple IPI to handle any ast's if needed.
1191 */
1192
1193 void
1194 forward_hardclock(void)
1195 {
1196 u_int map;
1197
1198 CTR0(KTR_SMP, "forward_hardclock");
1199
1200 if (!smp_started || cold || panicstr)
1201 return;
1202
1203 map = PCPU_GET(other_cpus) & ~(stopped_cpus|hlt_cpus_mask);
1204 if (map != 0)
1205 ipi_selected(map, IPI_HARDCLOCK);
1206 }
1207
/*
 * Handler for IPI_BITMAP_VECTOR interrupts: drain this CPU's pending
 * bitmap-encoded IPIs (hardclock/statclock forwarding) atomically and
 * run the corresponding clock processing inside a critical section.
 */
void
ipi_bitmap_handler(struct clockframe frame)
{
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;
	struct thread *td;

	/* Grab-and-clear so IPIs posted from here on raise a new vector. */
	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);

	critical_enter();

	/* Nothing to do for AST */

	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
		/* Bump the nesting level so this counts as interrupt context. */
		td = curthread;
		td->td_intr_nesting_level++;
		hardclock_process(&frame);
		td->td_intr_nesting_level--;
	}

	if (ipi_bitmap & (1 << IPI_STATCLOCK)) {
		CTR0(KTR_SMP, "forwarded_statclock");

		td = curthread;
		td->td_intr_nesting_level++;
		if (profprocs != 0)
			profclock(&frame);
		if (pscnt == psdiv)
			statclock(&frame);
		td->td_intr_nesting_level--;
	}

	critical_exit();
}
1242
1243 /*
1244 * send an IPI to a set of cpus.
1245 */
1246 void
1247 ipi_selected(u_int32_t cpus, u_int ipi)
1248 {
1249 int cpu;
1250 u_int bitmap = 0;
1251 u_int old_pending;
1252 u_int new_pending;
1253
1254 if (IPI_IS_BITMAPED(ipi)) {
1255 bitmap = 1 << ipi;
1256 ipi = IPI_BITMAP_VECTOR;
1257 }
1258
1259 CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
1260 while ((cpu = ffs(cpus)) != 0) {
1261 cpu--;
1262 cpus &= ~(1 << cpu);
1263
1264 KASSERT(cpu_apic_ids[cpu] != -1,
1265 ("IPI to non-existent CPU %d", cpu));
1266
1267 if (bitmap) {
1268 do {
1269 old_pending = cpu_ipi_pending[cpu];
1270 new_pending = old_pending | bitmap;
1271 } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],old_pending, new_pending));
1272
1273 if (old_pending)
1274 continue;
1275 }
1276
1277 lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
1278 }
1279
1280 }
1281
1282 /*
1283 * send an IPI INTerrupt containing 'vector' to all CPUs, including myself
1284 */
1285 void
1286 ipi_all(u_int ipi)
1287 {
1288
1289 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1290 lapic_ipi_vectored(ipi, APIC_IPI_DEST_ALL);
1291 }
1292
1293 /*
1294 * send an IPI to all CPUs EXCEPT myself
1295 */
1296 void
1297 ipi_all_but_self(u_int ipi)
1298 {
1299
1300 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1301 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
1302 }
1303
1304 /*
1305 * send an IPI to myself
1306 */
1307 void
1308 ipi_self(u_int ipi)
1309 {
1310
1311 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1312 lapic_ipi_vectored(ipi, APIC_IPI_DEST_SELF);
1313 }
1314
1315 #ifdef KDB_STOP_NMI
1316 /*
1317 * send NMI IPI to selected CPUs
1318 */
1319
1320 #define BEFORE_SPIN 1000000
1321
1322 void
1323 ipi_nmi_selected(u_int32_t cpus)
1324 {
1325
1326 int cpu;
1327 register_t icrlo;
1328
1329 icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT
1330 | APIC_TRIGMOD_EDGE;
1331
1332 CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus);
1333
1334
1335 atomic_set_int(&ipi_nmi_pending, cpus);
1336
1337
1338 while ((cpu = ffs(cpus)) != 0) {
1339 cpu--;
1340 cpus &= ~(1 << cpu);
1341
1342 KASSERT(cpu_apic_ids[cpu] != -1,
1343 ("IPI NMI to non-existent CPU %d", cpu));
1344
1345 /* Wait for an earlier IPI to finish. */
1346 if (!lapic_ipi_wait(BEFORE_SPIN))
1347 panic("ipi_nmi_selected: previous IPI has not cleared");
1348
1349 lapic_ipi_raw(icrlo,cpu_apic_ids[cpu]);
1350 }
1351 }
1352
1353
1354 int
1355 ipi_nmi_handler()
1356 {
1357 int cpu = PCPU_GET(cpuid);
1358
1359 if(!(atomic_load_acq_int(&ipi_nmi_pending) & (1 << cpu)))
1360 return 1;
1361
1362 atomic_clear_int(&ipi_nmi_pending,1 << cpu);
1363
1364 savectx(&stoppcbs[cpu]);
1365
1366 /* Indicate that we are stopped */
1367 atomic_set_int(&stopped_cpus,1 << cpu);
1368
1369
1370 /* Wait for restart */
1371 while(!(atomic_load_acq_int(&started_cpus) & (1 << cpu)))
1372 ia32_pause();
1373
1374 atomic_clear_int(&started_cpus,1 << cpu);
1375 atomic_clear_int(&stopped_cpus,1 << cpu);
1376
1377 if(cpu == 0 && cpustop_restartfunc != NULL)
1378 cpustop_restartfunc();
1379
1380 return 0;
1381 }
1382
1383 #endif /* KDB_STOP_NMI */
1384
1385 /*
1386 * This is called once the rest of the system is up and running and we're
1387 * ready to let the AP's out of the pen.
1388 */
/*
 * Release the APs from their spin loop and wait (under sched_lock,
 * with the release-store making our setup visible first) until the
 * last AP has set smp_started.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;		/* uniprocessor: no APs to release */
	mtx_lock_spin(&sched_lock);
	/* The APs spin on aps_ready; the rel barrier orders prior stores. */
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
	mtx_unlock_spin(&sched_lock);
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1402
1403 static int
1404 sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
1405 {
1406 u_int mask;
1407 int error;
1408
1409 mask = hlt_cpus_mask;
1410 error = sysctl_handle_int(oidp, &mask, 0, req);
1411 if (error || !req->newptr)
1412 return (error);
1413
1414 if (logical_cpus_mask != 0 &&
1415 (mask & logical_cpus_mask) == logical_cpus_mask)
1416 hlt_logical_cpus = 1;
1417 else
1418 hlt_logical_cpus = 0;
1419
1420 if (! hyperthreading_allowed)
1421 mask |= hyperthreading_cpus_mask;
1422
1423 if ((mask & all_cpus) == all_cpus)
1424 mask &= ~(1<<0);
1425 hlt_cpus_mask = mask;
1426 return (error);
1427 }
1428 SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
1429 0, 0, sysctl_hlt_cpus, "IU",
1430 "Bitmap of CPUs to halt. 101 (binary) will halt CPUs 0 and 2.");
1431
1432 static int
1433 sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
1434 {
1435 int disable, error;
1436
1437 disable = hlt_logical_cpus;
1438 error = sysctl_handle_int(oidp, &disable, 0, req);
1439 if (error || !req->newptr)
1440 return (error);
1441
1442 if (disable)
1443 hlt_cpus_mask |= logical_cpus_mask;
1444 else
1445 hlt_cpus_mask &= ~logical_cpus_mask;
1446
1447 if (! hyperthreading_allowed)
1448 hlt_cpus_mask |= hyperthreading_cpus_mask;
1449
1450 if ((hlt_cpus_mask & all_cpus) == all_cpus)
1451 hlt_cpus_mask &= ~(1<<0);
1452
1453 hlt_logical_cpus = disable;
1454 return (error);
1455 }
1456
1457 static int
1458 sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
1459 {
1460 int allowed, error;
1461
1462 allowed = hyperthreading_allowed;
1463 error = sysctl_handle_int(oidp, &allowed, 0, req);
1464 if (error || !req->newptr)
1465 return (error);
1466
1467 if (allowed)
1468 hlt_cpus_mask &= ~hyperthreading_cpus_mask;
1469 else
1470 hlt_cpus_mask |= hyperthreading_cpus_mask;
1471
1472 if (logical_cpus_mask != 0 &&
1473 (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
1474 hlt_logical_cpus = 1;
1475 else
1476 hlt_logical_cpus = 0;
1477
1478 if ((hlt_cpus_mask & all_cpus) == all_cpus)
1479 hlt_cpus_mask &= ~(1<<0);
1480
1481 hyperthreading_allowed = allowed;
1482 return (error);
1483 }
1484
/*
 * SYSINIT hook: if logical (HTT) CPUs exist, register the sysctls and
 * tunables that control halting them, and apply the boot-time tunable
 * settings to hlt_cpus_mask.
 */
static void
cpu_hlt_setup(void *dummy __unused)
{

	if (logical_cpus_mask != 0) {
		/* Honor the loader tunable before exposing the sysctl. */
		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
		    &hlt_logical_cpus);
		sysctl_ctx_init(&logical_cpu_clist);
		SYSCTL_ADD_PROC(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
		    sysctl_hlt_logical_cpus, "IU", "");
		SYSCTL_ADD_UINT(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
		    &logical_cpus_mask, 0, "");

		if (hlt_logical_cpus)
			hlt_cpus_mask |= logical_cpus_mask;

		/*
		 * If necessary for security purposes, force
		 * hyperthreading off, regardless of the value
		 * of hlt_logical_cpus.
		 */
		if (hyperthreading_cpus_mask) {
			TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
			    &hyperthreading_allowed);
			SYSCTL_ADD_PROC(&logical_cpu_clist,
			    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
			    "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
			    0, 0, sysctl_hyperthreading_allowed, "IU", "");
			if (! hyperthreading_allowed)
				hlt_cpus_mask |= hyperthreading_cpus_mask;
		}
	}
}
SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);
1523
/*
 * Park this CPU in HLT for as long as its bit is set in
 * hlt_cpus_mask.  Returns non-zero iff the CPU was halted at least
 * once (i.e. its bit was set on entry).
 */
int
mp_grab_cpu_hlt(void)
{
	u_int mask = PCPU_GET(cpumask);
#ifdef MP_WATCHDOG
	u_int cpuid = PCPU_GET(cpuid);
#endif
	int retval;

#ifdef MP_WATCHDOG
	ap_watchdog(cpuid);
#endif

	retval = mask & hlt_cpus_mask;
	/*
	 * "sti; hlt": on x86, sti defers interrupt delivery until after
	 * the next instruction, so no interrupt can slip in between the
	 * mask test and the hlt.  Each wakeup re-checks the mask.
	 */
	while (mask & hlt_cpus_mask)
		__asm __volatile("sti; hlt" : : : "memory");
	return (retval);
}
Cache object: 43ee55c1c0f7d481c0187333227e26a8
|