1 /*-
2 * Copyright (c) 1996, by Steve Passe
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. The name of the developer may NOT be used to endorse or promote products
11 * derived from this software without specific prior written permission.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD: releng/5.3/sys/i386/i386/mp_machdep.c 146167 2005-05-13 00:02:47Z nectar $");
28
29 #include "opt_apic.h"
30 #include "opt_cpu.h"
31 #include "opt_kstack_pages.h"
32 #include "opt_mp_watchdog.h"
33
34 #if !defined(lint)
35 #if !defined(SMP)
36 #error How did you get here?
37 #endif
38
39 #if defined(I386_CPU) && !defined(COMPILING_LINT)
40 #error SMP not supported with I386_CPU
41 #endif
42 #ifndef DEV_APIC
43 #error The apic device is required for SMP, add "device apic" to your config file.
44 #endif
45 #if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
46 #error SMP not supported with CPU_DISABLE_CMPXCHG
47 #endif
48 #endif /* not lint */
49
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/bus.h>
53 #include <sys/cons.h> /* cngetc() */
54 #ifdef GPROF
55 #include <sys/gmon.h>
56 #endif
57 #include <sys/kernel.h>
58 #include <sys/ktr.h>
59 #include <sys/lock.h>
60 #include <sys/malloc.h>
61 #include <sys/memrange.h>
62 #include <sys/mutex.h>
63 #include <sys/pcpu.h>
64 #include <sys/proc.h>
65 #include <sys/smp.h>
66 #include <sys/sysctl.h>
67
68 #include <vm/vm.h>
69 #include <vm/vm_param.h>
70 #include <vm/pmap.h>
71 #include <vm/vm_kern.h>
72 #include <vm/vm_extern.h>
73
74 #include <machine/apicreg.h>
75 #include <machine/clock.h>
76 #include <machine/md_var.h>
77 #include <machine/mp_watchdog.h>
78 #include <machine/pcb.h>
79 #include <machine/smp.h>
80 #include <machine/smptests.h> /** COUNT_XINVLTLB_HITS */
81 #include <machine/specialreg.h>
82 #include <machine/privatespace.h>
83
84 #define WARMBOOT_TARGET 0
85 #define WARMBOOT_OFF (KERNBASE + 0x0467)
86 #define WARMBOOT_SEG (KERNBASE + 0x0469)
87
88 #define CMOS_REG (0x70)
89 #define CMOS_DATA (0x71)
90 #define BIOS_RESET (0x0f)
91 #define BIOS_WARM (0x0a)
92
93 /*
94 * this code MUST be enabled here and in mpboot.s.
95 * it follows the very early stages of AP boot by placing values in CMOS ram.
96 * it NORMALLY will never be needed and thus the primitive method for enabling.
97 *
98 #define CHECK_POINTS
99 */
100
101 #if defined(CHECK_POINTS) && !defined(PC98)
102 #define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA))
103 #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))
104
105 #define CHECK_INIT(D); \
106 CHECK_WRITE(0x34, (D)); \
107 CHECK_WRITE(0x35, (D)); \
108 CHECK_WRITE(0x36, (D)); \
109 CHECK_WRITE(0x37, (D)); \
110 CHECK_WRITE(0x38, (D)); \
111 CHECK_WRITE(0x39, (D));
112
113 #define CHECK_PRINT(S); \
114 printf("%s: %d, %d, %d, %d, %d, %d\n", \
115 (S), \
116 CHECK_READ(0x34), \
117 CHECK_READ(0x35), \
118 CHECK_READ(0x36), \
119 CHECK_READ(0x37), \
120 CHECK_READ(0x38), \
121 CHECK_READ(0x39));
122
123 #else /* CHECK_POINTS */
124
125 #define CHECK_INIT(D)
126 #define CHECK_PRINT(S)
127 #define CHECK_WRITE(A, D)
128
129 #endif /* CHECK_POINTS */
130
131 /*
132 * Values to send to the POST hardware.
133 */
134 #define MP_BOOTADDRESS_POST 0x10
135 #define MP_PROBE_POST 0x11
136 #define MPTABLE_PASS1_POST 0x12
137
138 #define MP_START_POST 0x13
139 #define MP_ENABLE_POST 0x14
140 #define MPTABLE_PASS2_POST 0x15
141
142 #define START_ALL_APS_POST 0x16
143 #define INSTALL_AP_TRAMP_POST 0x17
144 #define START_AP_POST 0x18
145
146 #define MP_ANNOUNCE_POST 0x19
147
/* Lock serializing mcount state for kernel profiling (GPROF). */
int mcount_lock;

/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */
int current_postcode;

int mp_naps;			/* # of Application Processors started so far */
int boot_cpu_id = -1;		/* APIC ID of designated BSP; -1 until known */
extern int nkpt;		/* # of kernel page table pages (from pmap) */

/*
 * CPU topology map datastructures for HTT.
 */
static struct cpu_group mp_groups[MAXCPU];
static struct cpu_top mp_top;

/* AP uses this during bootstrap. Do not staticize. */
char *bootSTK;			/* stack the next AP boots on */
static int bootAP;		/* logical cpu id of the AP being started */

/* Hotwire a 0->4MB V==P mapping */
extern pt_entry_t *KPTphys;

/* SMP page table page */
extern pt_entry_t *SMPpt;

/* Per-CPU saved contexts for CPUs parked by the IPI_STOP handler. */
struct pcb stoppcbs[MAXCPU];

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr1;	/* start of target range (IPI_INVLRNG) */
vm_offset_t smp_tlb_addr2;	/* end of target range (IPI_INVLRNG) */
volatile int smp_tlb_wait;	/* # of CPUs that have acked the shootdown */

/*
 * Local data and functions.
 */

/* Logical CPUs (HTT threads) per package, 0 when HTT is unused. */
static u_int logical_cpus;

/* used to hold the AP's until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info {
	int	cpu_present:1;	/* CPU was enumerated by the firmware */
	int	cpu_bsp:1;	/* CPU is the bootstrap processor */
} static cpu_info[MAXCPU];
static int cpu_apic_ids[MAXCPU];	/* logical cpu id -> APIC ID (-1 = none) */

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

/* Physical address of the AP trampoline, set by mp_bootaddress(). */
static u_int boot_address;

static void	set_logical_apic_ids(void);
static int	start_all_aps(void);
static void	install_ap_tramp(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

/* State backing the logical-cpu / hyperthreading sysctl knobs. */
static int	hlt_logical_cpus;
static u_int	hyperthreading_cpus;	/* threads sharing a cache (cpuid 4) */
static cpumask_t	hyperthreading_cpus_mask;
static int	hyperthreading_allowed;
static struct	sysctl_ctx_list logical_cpu_clist;
219
220 static void
221 mem_range_AP_init(void)
222 {
223 if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
224 mem_range_softc.mr_op->initAP(&mem_range_softc);
225 }
226
/*
 * Build the smp_topology map the scheduler uses to group HTT sibling
 * CPUs.  Leaves smp_topology untouched (NULL) when the hardware has
 * no HTT support or reports a single logical CPU per package.
 */
void
mp_topology(void)
{
	struct cpu_group *group;
	int logical_cpus;	/* threads per physical package */
	int apic_id;
	int groups;
	int cpu;

	/* Build the smp_topology map. */
	/* Nothing to do if there is no HTT support. */
	if ((cpu_feature & CPUID_HTT) == 0)
		return;
	/* CPUID leaf 1 EBX bits 23:16: logical processors per package. */
	logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
	if (logical_cpus <= 1)
		return;
	group = &mp_groups[0];
	groups = 1;
	/*
	 * Walk present CPUs in APIC ID order.  Sibling threads have
	 * consecutive APIC IDs, so a package boundary falls wherever
	 * apic_id is a multiple of logical_cpus.
	 */
	for (cpu = 0, apic_id = 0; apic_id < MAXCPU; apic_id++) {
		if (!cpu_info[apic_id].cpu_present)
			continue;
		/*
		 * If the current group has members and we're not a logical
		 * cpu, create a new group.
		 */
		if (group->cg_count != 0 && (apic_id % logical_cpus) == 0) {
			group++;
			groups++;
		}
		group->cg_count++;
		group->cg_mask |= 1 << cpu;	/* mask is by logical cpu id */
		cpu++;
	}

	mp_top.ct_count = groups;
	mp_top.ct_group = mp_groups;
	smp_topology = &mp_top;		/* publish for the scheduler */
}
265
266
267 /*
268 * Calculate usable address in base memory for AP trampoline code.
269 */
270 u_int
271 mp_bootaddress(u_int basemem)
272 {
273 POSTCODE(MP_BOOTADDRESS_POST);
274
275 boot_address = trunc_page(basemem); /* round down to 4k boundary */
276 if ((basemem - boot_address) < bootMP_size)
277 boot_address -= PAGE_SIZE; /* not enough, lower by 4k */
278
279 return boot_address;
280 }
281
282 void
283 cpu_add(u_int apic_id, char boot_cpu)
284 {
285
286 if (apic_id >= MAXCPU) {
287 printf("SMP: CPU %d exceeds maximum CPU %d, ignoring\n",
288 apic_id, MAXCPU - 1);
289 return;
290 }
291 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
292 apic_id));
293 cpu_info[apic_id].cpu_present = 1;
294 if (boot_cpu) {
295 KASSERT(boot_cpu_id == -1,
296 ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
297 boot_cpu_id));
298 boot_cpu_id = apic_id;
299 cpu_info[apic_id].cpu_bsp = 1;
300 }
301 mp_ncpus++;
302 if (bootverbose)
303 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
304 "AP");
305
306 }
307
/*
 * Set mp_maxid.  We assume pessimistically that a CPU with any APIC ID
 * up to MAXCPU - 1 may exist, since APs are enumerated later.
 */
void
cpu_mp_setmaxid(void)
{

	mp_maxid = MAXCPU - 1;
}
314
315 int
316 cpu_mp_probe(void)
317 {
318
319 /*
320 * Always record BSP in CPU map so that the mbuf init code works
321 * correctly.
322 */
323 all_cpus = 1;
324 if (mp_ncpus == 0) {
325 /*
326 * No CPUs were found, so this must be a UP system. Setup
327 * the variables to represent a system with a single CPU
328 * with an id of 0.
329 */
330 mp_ncpus = 1;
331 return (0);
332 }
333
334 /* At least one CPU was found. */
335 if (mp_ncpus == 1) {
336 /*
337 * One CPU was found, so this must be a UP system with
338 * an I/O APIC.
339 */
340 return (0);
341 }
342
343 /* At least two CPUs were found. */
344 return (1);
345 }
346
347 /*
348 * Initialize the IPI handlers and start up the AP's.
349 */
void
cpu_mp_start(void)
{
	int i;
	u_int threads_per_cache, p[4];	/* p[] receives cpuid_count() regs */

	POSTCODE(MP_START_POST);

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;
		cpu_ipi_pending[i] = 0;
	}

	/* Install an inter-CPU IPI for TLB invalidation */
	setidt(IPI_INVLTLB, IDTVEC(invltlb),
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(IPI_INVLPG, IDTVEC(invlpg),
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(IPI_INVLRNG, IDTVEC(invlrng),
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* Install an inter-CPU IPI for lazy pmap release */
	setidt(IPI_LAZYPMAP, IDTVEC(lazypmap),
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* Install an inter-CPU IPI for all-CPU rendezvous */
	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous),
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* Install generic inter-CPU IPI handler */
	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* Install an inter-CPU IPI for CPU stop/restart */
	setidt(IPI_STOP, IDTVEC(cpustop),
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));


	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		/* Firmware tables named no BSP: we must be running on it. */
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));
	cpu_apic_ids[0] = boot_cpu_id;	/* the BSP is always logical cpu 0 */

	/* Start each Application Processor */
	start_all_aps();

	/* Setup the initial logical CPUs info. */
	logical_cpus = logical_cpus_mask = 0;
	if (cpu_feature & CPUID_HTT)
		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;

	/*
	 * Work out if hyperthreading is *really* enabled.  This
	 * is made really ugly by the fact that processors lie: Dual
	 * core processors claim to be hyperthreaded even when they're
	 * not, presumably because they want to be treated the same
	 * way as HTT with respect to per-cpu software licensing.
	 * At the time of writing (May 12, 2005) the only hyperthreaded
	 * cpus are from Intel, and Intel's dual-core processors can be
	 * identified via the "deterministic cache parameters" cpuid
	 * calls.
	 */
	/*
	 * First determine if this is an Intel processor which claims
	 * to have hyperthreading support.
	 */
	if ((cpu_feature & CPUID_HTT) &&
	    (strcmp(cpu_vendor, "GenuineIntel") == 0)) {
		/*
		 * If the "deterministic cache parameters" cpuid calls
		 * are available, use them.
		 */
		if (cpu_high >= 4) {
			/* Ask the processor about up to 32 caches. */
			for (i = 0; i < 32; i++) {
				cpuid_count(4, i, p);
				/* EAX bits 25:14: threads sharing cache - 1 */
				threads_per_cache = ((p[0] & 0x3ffc000) >> 14) + 1;
				if (hyperthreading_cpus < threads_per_cache)
					hyperthreading_cpus = threads_per_cache;
				/* EAX bits 4:0 == 0: no more cache levels */
				if ((p[0] & 0x1f) == 0)
					break;
			}
		}

		/*
		 * If the deterministic cache parameters are not
		 * available, or if no caches were reported to exist,
		 * just accept what the HTT flag indicated.
		 */
		if (hyperthreading_cpus == 0)
			hyperthreading_cpus = logical_cpus;
	}

	set_logical_apic_ids();
}
450
451
452 /*
453 * Print various information about the SMP system hardware and setup.
454 */
455 void
456 cpu_mp_announce(void)
457 {
458 int i, x;
459
460 POSTCODE(MP_ANNOUNCE_POST);
461
462 /* List CPUs */
463 printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
464 for (i = 1, x = 0; x < MAXCPU; x++) {
465 if (cpu_info[x].cpu_present && !cpu_info[x].cpu_bsp) {
466 KASSERT(i < mp_ncpus,
467 ("mp_ncpus and actual cpus are out of whack"));
468 printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
469 }
470 }
471 }
472
473 /*
474 * AP CPU's call this to initialize themselves.
475 */
void
init_secondary(void)
{
	int gsel_tss;
	int x, myid;
	u_int cr0;

	/* bootAP is set in start_ap() to our ID. */
	myid = bootAP;

	/*
	 * Point this AP's private-data and TSS descriptors at its slot
	 * of SMP_prvspace, and link the pcpu self-pointer.
	 */
	gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid];
	gdt_segs[GPROC0_SEL].ssd_base =
		(int) &SMP_prvspace[myid].pcpu.pc_common_tss;
	SMP_prvspace[myid].pcpu.pc_prvspace =
		&SMP_prvspace[myid].pcpu;

	/* Build this AP's private copy of the GDT. */
	for (x = 0; x < NGDT; x++) {
		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
	}

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (int) &gdt[myid * NGDT];
	lgdt(&r_gdt);		/* does magic intra-segment return */

	lidt(&r_idt);		/* load the shared IDT */

	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);

	/* Initialize and load this CPU's TSS. */
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	/* I/O bitmap offset past the TSS limit => no I/O bitmap. */
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	ltr(gsel_tss);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	cr0 = rcr0();
	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);	/* caching on, FPU on */
	load_cr0(cr0);
	CHECK_WRITE(0x38, 5);

	/* Disable local APIC just to be sure. */
	lapic_disable();

	/* signal our startup to the BSP. */
	mp_naps++;
	CHECK_WRITE(0x39, 6);

	/* Spin until the BSP releases the AP's. */
	while (!aps_ready)
		ia32_pause();

	/* BSP may have changed PTD while we were waiting */
	invltlb();
	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
	/* NOTE(review): presumably reloads the IDT after the F00F
	 * workaround relocates it -- confirm against i386/machdep.c. */
	lidt(&r_idt);
#endif

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up FPU state on the AP */
	npxinit(__INITIAL_NPXCW__);

	/* set up SSE registers */
	enable_sse();

	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		printf("PTD[MPPTDI] = %#jx\n", (uintmax_t)PTD[MPPTDI]);
		panic("cpuid mismatch! boom!!");
	}

	mtx_lock_spin(&ap_boot_mtx);

	/* Init local apic for irq's */
	lapic_setup();

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	smp_cpus++;	/* protected by ap_boot_mtx */

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* Determine if we are a logical CPU. */
	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
		logical_cpus_mask |= PCPU_GET(cpumask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		hyperthreading_cpus_mask |= PCPU_GET(cpumask);

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	if (bootverbose)
		lapic_dump("AP");

	/* The last AP to arrive marks the system fully started. */
	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	 /* historic */
	}

	mtx_unlock_spin(&ap_boot_mtx);

	/* wait until all the AP's are up */
	while (smp_started == 0)
		ia32_pause();

	/* ok, now grab sched_lock and enter the scheduler */
	mtx_lock_spin(&sched_lock);

	binuptime(PCPU_PTR(switchtime));
	PCPU_SET(switchticks, ticks);

	cpu_throw(NULL, choosethread());	/* doesn't return */

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}
611
612 /*******************************************************************
613 * local functions and data
614 */
615
616 /*
617 * Set the APIC logical IDs.
618 *
619 * We want to cluster logical CPU's within the same APIC ID cluster.
620 * Since logical CPU's are aligned simply filling in the clusters in
621 * APIC ID order works fine. Note that this does not try to balance
622 * the number of CPU's in each cluster. (XXX?)
623 */
624 static void
625 set_logical_apic_ids(void)
626 {
627 u_int apic_id, cluster, cluster_id;
628
629 /* Force us to allocate cluster 0 at the start. */
630 cluster = -1;
631 cluster_id = APIC_MAX_INTRACLUSTER_ID;
632 for (apic_id = 0; apic_id < MAXCPU; apic_id++) {
633 if (!cpu_info[apic_id].cpu_present)
634 continue;
635 if (cluster_id == APIC_MAX_INTRACLUSTER_ID) {
636 cluster = ioapic_next_logical_cluster();
637 cluster_id = 0;
638 } else
639 cluster_id++;
640 if (bootverbose)
641 printf("APIC ID: physical %u, logical %u:%u\n",
642 apic_id, cluster, cluster_id);
643 lapic_set_logical_id(apic_id, cluster, cluster_id);
644 }
645 }
646
647 /*
648 * start each AP in our list
649 */
static int
start_all_aps(void)
{
#ifndef PC98
	u_char mpbiosreason;
#endif
	u_long mpbioswarmvec;
	struct pcpu *pc;
	char *stack;
	uintptr_t kptbase;
	int i, pg, apic_id, cpu;

	POSTCODE(START_ALL_APS_POST);

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* install the AP 1st level boot code */
	install_ap_tramp();

	/* save the current value of the warm-start vector */
	mpbioswarmvec = *((u_long *) WARMBOOT_OFF);
#ifndef PC98
	/* ... and of the CMOS BIOS reset code, restored at the end */
	outb(CMOS_REG, BIOS_RESET);
	mpbiosreason = inb(CMOS_DATA);
#endif

	/* set up temporary P==V mapping for AP boot */
	/* XXX this is a hack, we should boot the AP on its own stack/PTD */
	kptbase = (uintptr_t)(void *)KPTphys;
	for (i = 0; i < NKPT; i++)
		PTD[i] = (pd_entry_t)(PG_V | PG_RW |
		    ((kptbase + i * PAGE_SIZE) & PG_FRAME));
	invltlb();

	/* start each AP */
	for (cpu = 0, apic_id = 0; apic_id < MAXCPU; apic_id++) {
		/* APs only; the BSP is already running. */
		if (!cpu_info[apic_id].cpu_present ||
		    cpu_info[apic_id].cpu_bsp)
			continue;
		cpu++;

		/* save APIC ID for this logical ID */
		cpu_apic_ids[cpu] = apic_id;

		/* first page of AP's private space */
		pg = cpu * i386_btop(sizeof(struct privatespace));

		/* allocate a new private data page */
		pc = (struct pcpu *)kmem_alloc(kernel_map, PAGE_SIZE);

		/* wire it into the private page table page */
		SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(pc));

		/* allocate and set up an idle stack data page */
		stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);	/* XXXKSE */
		for (i = 0; i < KSTACK_PAGES; i++)
			SMPpt[pg + 1 + i] = (pt_entry_t)
			    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));

		/* prime data page for it to use */
		pcpu_init(pc, cpu, sizeof(struct pcpu));
		pc->pc_apic_id = apic_id;

		/* setup a vector to our boot code */
		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
		*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
#ifndef PC98
		outb(CMOS_REG, BIOS_RESET);
		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
#endif

		/* stack for the AP to run on until it enters the scheduler */
		bootSTK = &SMP_prvspace[cpu].idlekstack[KSTACK_PAGES *
		    PAGE_SIZE];
		bootAP = cpu;	/* init_secondary() reads this as its id */

		/* attempt to start the Application Processor */
		CHECK_INIT(99);	/* setup checkpoints */
		if (!start_ap(apic_id)) {
			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
			CHECK_PRINT("trace");	/* show checkpoints */
			/* better panic as the AP may be running loose */
			printf("panic y/n? [y] ");
			if (cngetc() != 'n')
				panic("bye-bye");
		}
		CHECK_PRINT("trace");	/* show checkpoints */

		all_cpus |= (1 << cpu);	/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	/* restore the warmstart vector */
	*(u_long *) WARMBOOT_OFF = mpbioswarmvec;
#ifndef PC98
	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, mpbiosreason);
#endif

	/*
	 * Set up the idle context for the BSP.  Similar to above except
	 * that some was done by locore, some by pmap.c and some is implicit
	 * because the BSP is cpu#0 and the page is initially zero and also
	 * because we can refer to variables by name on the BSP..
	 */

	/* Allocate and setup BSP idle stack */
	stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
	for (i = 0; i < KSTACK_PAGES; i++)
		SMPpt[1 + i] = (pt_entry_t)
		    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));

	/* Tear down the temporary P==V mapping installed above. */
	for (i = 0; i < NKPT; i++)
		PTD[i] = 0;
	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);

	/* number of APs actually started */
	return mp_naps;
}
770
771 /*
772 * load the 1st level AP boot code into base memory.
773 */
774
/* targets for relocation -- symbols inside the mpboot.s trampoline */
extern void bigJump(void);	/* far jump patched to point at MPentry */
extern void bootCodeSeg(void);	/* boot code segment value to patch */
extern void bootDataSeg(void);	/* boot data segment value to patch */
extern void MPentry(void);	/* 32-bit AP entry point */
extern u_int MP_GDT;		/* temporary GDT image in the trampoline */
extern u_int mp_gdtbase;	/* lgdt operand inside the trampoline */
782
/*
 * Copy the 16-bit AP startup trampoline (mpboot.s) to boot_address in
 * base memory and patch its embedded addresses for this boot.
 */
static void
install_ap_tramp(void)
{
	int x;
	int size = *(int *) ((u_long) & bootMP_size);
	vm_offset_t va = boot_address + KERNBASE;	/* KVA alias of target */
	u_char *src = (u_char *) ((u_long) bootMP);
	u_char *dst = (u_char *) va;
	u_int boot_base = (u_int) bootMP;	/* for symbol -> offset math */
	u_int8_t *dst8;
	u_int16_t *dst16;
	u_int32_t *dst32;

	POSTCODE(INSTALL_AP_TRAMP_POST);

	KASSERT (size <= PAGE_SIZE,
	    ("'size' do not fit into PAGE_SIZE, as expected."));
	pmap_kenter(va, boot_address);
	pmap_invalidate_page (kernel_pmap, va);
	/* copy the trampoline image byte by byte */
	for (x = 0; x < size; ++x)
		*dst++ = *src++;

	/*
	 * modify addresses in code we just moved to basemem. unfortunately we
	 * need fairly detailed info about mpboot.s for this to work.  changes
	 * to mpboot.s might require changes here.
	 */

	/* boot code is located in KERNEL space */
	dst = (u_char *) va;

	/* modify the lgdt arg */
	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
	*dst32 = boot_address + ((u_int) & MP_GDT - boot_base);

	/* modify the ljmp target for MPentry() */
	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
	*dst32 = ((u_int) MPentry - KERNBASE);	/* physical address */

	/* modify the target for boot code segment */
	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
	dst8 = (u_int8_t *) (dst16 + 1);
	*dst16 = (u_int) boot_address & 0xffff;
	*dst8 = ((u_int) boot_address >> 16) & 0xff;

	/* modify the target for boot data segment */
	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
	dst8 = (u_int8_t *) (dst16 + 1);
	*dst16 = (u_int) boot_address & 0xffff;
	*dst8 = ((u_int) boot_address >> 16) & 0xff;
}
834
835 /*
836 * This function starts the AP (application processor) identified
837 * by the APIC ID 'physicalCpu'. It does quite a "song and dance"
838 * to accomplish this. This is necessary because of the nuances
839 * of the different hardware we might encounter. It isn't pretty,
840 * but it seems to work.
841 */
static int
start_ap(int apic_id)
{
	int vector, ms;
	int cpus;

	POSTCODE(START_AP_POST);

	/* calculate the vector (page number of the trampoline) */
	vector = (boot_address >> 12) & 0xff;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	/*
	 * first we do an INIT/RESET IPI this INIT IPI might be run, reseting
	 * and running the target CPU. OR this INIT IPI might be latched (P5
	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
	 * ignored.
	 */

	/* do an INIT IPI: assert RESET */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);

	/* wait for pending status end */
	lapic_ipi_wait(-1);

	/* do an INIT IPI: deassert RESET */
	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);

	/* wait for pending status end */
	DELAY(10000);		/* wait ~10mS */
	lapic_ipi_wait(-1);

	/*
	 * next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched, (P5 bug) this 1st STARTUP would then terminate
	 * immediately, and the previously started INIT IPI would continue. OR
	 * the previous INIT IPI has already run. and this STARTUP IPI will
	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
	 * will run.
	 */

	/* do a STARTUP IPI */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200uS */

	/*
	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */

	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200uS */

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		/* init_secondary() bumps mp_naps once the AP is alive */
		if (mp_naps > cpus)
			return 1;	/* return SUCCESS */
		DELAY(1000);
	}
	return 0;		/* return FAILURE */
}
915
916 #ifdef COUNT_XINVLTLB_HITS
917 u_int xhits_gbl[MAXCPU];
918 u_int xhits_pg[MAXCPU];
919 u_int xhits_rng[MAXCPU];
920 SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
921 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
922 sizeof(xhits_gbl), "IU", "");
923 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
924 sizeof(xhits_pg), "IU", "");
925 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
926 sizeof(xhits_rng), "IU", "");
927
928 u_int ipi_global;
929 u_int ipi_page;
930 u_int ipi_range;
931 u_int ipi_range_size;
932 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
933 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
934 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
935 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
936 0, "");
937
938 u_int ipi_masked_global;
939 u_int ipi_masked_page;
940 u_int ipi_masked_range;
941 u_int ipi_masked_range_size;
942 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
943 &ipi_masked_global, 0, "");
944 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
945 &ipi_masked_page, 0, "");
946 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
947 &ipi_masked_range, 0, "");
948 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
949 &ipi_masked_range_size, 0, "");
950 #endif /* COUNT_XINVLTLB_HITS */
951
952 /*
953 * Flush the TLB on all other CPU's
954 */
/*
 * Post the target addresses, clear the ack counter, send `vector' to
 * every other CPU and spin until they have all bumped smp_tlb_wait.
 * The caller must hold smp_rv_mtx so only one shootdown is in flight.
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;

	ncpu = mp_ncpus - 1;	/* does not shootdown self */
	if (ncpu < 1)
		return;		/* no other cpus */
	mtx_assert(&smp_rv_mtx, MA_OWNED);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	/* Release barrier: the address stores must be visible first. */
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	/* Wait for every target to acknowledge. */
	while (smp_tlb_wait < ncpu)
		ia32_pause();
}
971
972 /*
973 * This is about as magic as it gets. fortune(1) has got similar code
974 * for reversing bits in a word. Who thinks up this stuff??
975 *
976 * Yes, it does appear to be consistently faster than:
977 * while (i = ffs(m)) {
978 * m >>= i;
979 * bits++;
980 * }
981 * and
982 * while (lsb = (m & -m)) { // This is magic too
983 * m &= ~lsb; // or: m ^= lsb
984 * bits++;
985 * }
986 * Both of these latter forms do some very strange things on gcc-3.1 with
987 * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
988 * There is probably an SSE or MMX popcnt instruction.
989 *
990 * I wonder if this should be in libkern?
991 *
992 * XXX Stop the presses! Another one:
993 * static __inline u_int32_t
994 * popcnt1(u_int32_t v)
995 * {
996 * v -= ((v >> 1) & 0x55555555);
997 * v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
998 * v = (v + (v >> 4)) & 0x0F0F0F0F;
999 * return (v * 0x01010101) >> 24;
1000 * }
1001 * The downside is that it has a multiply. With a pentium3 with
1002 * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
1003 * an imull, and in that case it is faster. In most other cases
1004 * it appears slightly slower.
1005 *
1006 * Another variant (also from fortune):
1007 * #define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255)
1008 * #define BX_(x) ((x) - (((x)>>1)&0x77777777) \
1009 * - (((x)>>2)&0x33333333) \
1010 * - (((x)>>3)&0x11111111))
1011 */
/*
 * Count the set bits in m: classic SWAR reduction folding pair,
 * nibble, byte and halfword sums in place.  Equivalent to the
 * masked-add formulation, just with the subtraction trick for the
 * first fold.
 */
static __inline u_int32_t
popcnt(u_int32_t m)
{

	m -= (m >> 1) & 0x55555555;
	m = (m & 0x33333333) + ((m >> 2) & 0x33333333);
	m = (m + (m >> 4)) & 0x0f0f0f0f;
	m += m >> 8;
	m += m >> 16;
	return (m & 0xff);
}
1023
/*
 * Like smp_tlb_shootdown(), but only IPI the CPUs in `mask'
 * (mask == -1 means all-but-self).  Caller must hold smp_rv_mtx.
 */
static void
smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	int ncpu, othercpus;

	othercpus = mp_ncpus - 1;	/* upper bound on # of targets */
	if (mask == (u_int)-1) {
		ncpu = othercpus;
		if (ncpu < 1)
			return;		/* no other cpus */
	} else {
		mask &= ~PCPU_GET(cpumask);	/* never IPI ourselves */
		if (mask == 0)
			return;
		ncpu = popcnt(mask);	/* # of acks to wait for */
		if (ncpu > othercpus) {
			/* XXX this should be a panic offence */
			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
			    ncpu, othercpus);
			ncpu = othercpus;
		}
		/* XXX should be a panic, implied by mask == 0 above */
		if (ncpu < 1)
			return;
	}
	mtx_assert(&smp_rv_mtx, MA_OWNED);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	/* Release barrier: the address stores must be visible first. */
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (mask == (u_int)-1)
		ipi_all_but_self(vector);
	else
		ipi_selected(mask, vector);
	/* Wait for every target to acknowledge. */
	while (smp_tlb_wait < ncpu)
		ia32_pause();
}
1060
1061 void
1062 smp_invltlb(void)
1063 {
1064 if (smp_started) {
1065 smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
1066 #ifdef COUNT_XINVLTLB_HITS
1067 ipi_global++;
1068 #endif
1069 }
1070 }
1071
1072 void
1073 smp_invlpg(vm_offset_t addr)
1074 {
1075 if (smp_started) {
1076 smp_tlb_shootdown(IPI_INVLPG, addr, 0);
1077 #ifdef COUNT_XINVLTLB_HITS
1078 ipi_page++;
1079 #endif
1080 }
1081 }
1082
1083 void
1084 smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
1085 {
1086 if (smp_started) {
1087 smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
1088 #ifdef COUNT_XINVLTLB_HITS
1089 ipi_range++;
1090 ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
1091 #endif
1092 }
1093 }
1094
1095 void
1096 smp_masked_invltlb(u_int mask)
1097 {
1098 if (smp_started) {
1099 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
1100 #ifdef COUNT_XINVLTLB_HITS
1101 ipi_masked_global++;
1102 #endif
1103 }
1104 }
1105
1106 void
1107 smp_masked_invlpg(u_int mask, vm_offset_t addr)
1108 {
1109 if (smp_started) {
1110 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
1111 #ifdef COUNT_XINVLTLB_HITS
1112 ipi_masked_page++;
1113 #endif
1114 }
1115 }
1116
1117 void
1118 smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
1119 {
1120 if (smp_started) {
1121 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
1122 #ifdef COUNT_XINVLTLB_HITS
1123 ipi_masked_range++;
1124 ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
1125 #endif
1126 }
1127 }
1128
1129
1130 /*
1131 * For statclock, we send an IPI to all CPU's to have them call this
1132 * function.
1133 */
1134
1135 void
1136 forward_statclock(void)
1137 {
1138 int map;
1139
1140 CTR0(KTR_SMP, "forward_statclock");
1141
1142 if (!smp_started || cold || panicstr)
1143 return;
1144
1145 map = PCPU_GET(other_cpus) & ~(stopped_cpus|hlt_cpus_mask);
1146 if (map != 0)
1147 ipi_selected(map, IPI_STATCLOCK);
1148 }
1149
1150 /*
1151 * For each hardclock(), we send an IPI to all other CPU's to have them
1152 * execute this function. It would be nice to reduce contention on
1153 * sched_lock if we could simply peek at the CPU to determine the user/kernel
1154 * state and call hardclock_process() on the CPU receiving the clock interrupt
1155 * and then just use a simple IPI to handle any ast's if needed.
1156 */
1157
1158 void
1159 forward_hardclock(void)
1160 {
1161 u_int map;
1162
1163 CTR0(KTR_SMP, "forward_hardclock");
1164
1165 if (!smp_started || cold || panicstr)
1166 return;
1167
1168 map = PCPU_GET(other_cpus) & ~(stopped_cpus|hlt_cpus_mask);
1169 if (map != 0)
1170 ipi_selected(map, IPI_HARDCLOCK);
1171 }
1172
/*
 * Handler for the shared bitmapped-IPI vector: atomically consume this
 * CPU's pending-IPI bitmap and perform the work each set bit requests
 * (forwarded hardclock/statclock).  Runs in interrupt context with the
 * interrupted frame in 'frame'.
 */
void
ipi_bitmap_handler(struct clockframe frame)
{
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;
	struct thread *td;

	/* Claim every IPI posted to this CPU so far in one atomic op. */
	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);

	critical_enter();

	/* Nothing to do for AST */

	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
		/* Bump the nesting level so clock code sees interrupt context. */
		td = curthread;
		td->td_intr_nesting_level++;
		hardclock_process(&frame);
		td->td_intr_nesting_level--;
	}

	if (ipi_bitmap & (1 << IPI_STATCLOCK)) {
		CTR0(KTR_SMP, "forwarded_statclock");

		td = curthread;
		td->td_intr_nesting_level++;
		if (profprocs != 0)
			profclock(&frame);
		if (pscnt == psdiv)
			statclock(&frame);
		td->td_intr_nesting_level--;
	}

	critical_exit();
}
1207
1208 /*
1209 * send an IPI to a set of cpus.
1210 */
1211 void
1212 ipi_selected(u_int32_t cpus, u_int ipi)
1213 {
1214 int cpu;
1215 u_int bitmap = 0;
1216 u_int old_pending;
1217 u_int new_pending;
1218
1219 if (IPI_IS_BITMAPED(ipi)) {
1220 bitmap = 1 << ipi;
1221 ipi = IPI_BITMAP_VECTOR;
1222 }
1223
1224 CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
1225 while ((cpu = ffs(cpus)) != 0) {
1226 cpu--;
1227 cpus &= ~(1 << cpu);
1228
1229 KASSERT(cpu_apic_ids[cpu] != -1,
1230 ("IPI to non-existent CPU %d", cpu));
1231
1232 if (bitmap) {
1233 do {
1234 old_pending = cpu_ipi_pending[cpu];
1235 new_pending = old_pending | bitmap;
1236 } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],old_pending, new_pending));
1237
1238 if (old_pending)
1239 continue;
1240 }
1241
1242 lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
1243 }
1244
1245 }
1246
1247 /*
1248 * send an IPI INTerrupt containing 'vector' to all CPUs, including myself
1249 */
1250 void
1251 ipi_all(u_int ipi)
1252 {
1253
1254 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1255 lapic_ipi_vectored(ipi, APIC_IPI_DEST_ALL);
1256 }
1257
1258 /*
1259 * send an IPI to all CPUs EXCEPT myself
1260 */
1261 void
1262 ipi_all_but_self(u_int ipi)
1263 {
1264
1265 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1266 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
1267 }
1268
1269 /*
1270 * send an IPI to myself
1271 */
1272 void
1273 ipi_self(u_int ipi)
1274 {
1275
1276 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1277 lapic_ipi_vectored(ipi, APIC_IPI_DEST_SELF);
1278 }
1279
1280 /*
1281 * This is called once the rest of the system is up and running and we're
1282 * ready to let the AP's out of the pen.
1283 */
1284 static void
1285 release_aps(void *dummy __unused)
1286 {
1287
1288 if (mp_ncpus == 1)
1289 return;
1290 mtx_lock_spin(&sched_lock);
1291 atomic_store_rel_int(&aps_ready, 1);
1292 while (smp_started == 0)
1293 ia32_pause();
1294 mtx_unlock_spin(&sched_lock);
1295 }
1296 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1297
1298 static int
1299 sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
1300 {
1301 u_int mask;
1302 int error;
1303
1304 mask = hlt_cpus_mask;
1305 error = sysctl_handle_int(oidp, &mask, 0, req);
1306 if (error || !req->newptr)
1307 return (error);
1308
1309 if (logical_cpus_mask != 0 &&
1310 (mask & logical_cpus_mask) == logical_cpus_mask)
1311 hlt_logical_cpus = 1;
1312 else
1313 hlt_logical_cpus = 0;
1314
1315 if (! hyperthreading_allowed)
1316 mask |= hyperthreading_cpus_mask;
1317
1318 if ((mask & all_cpus) == all_cpus)
1319 mask &= ~(1<<0);
1320 hlt_cpus_mask = mask;
1321 return (error);
1322 }
1323 SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
1324 0, 0, sysctl_hlt_cpus, "IU",
1325 "Bitmap of CPUs to halt. 101 (binary) will halt CPUs 0 and 2.");
1326
1327 static int
1328 sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
1329 {
1330 int disable, error;
1331
1332 disable = hlt_logical_cpus;
1333 error = sysctl_handle_int(oidp, &disable, 0, req);
1334 if (error || !req->newptr)
1335 return (error);
1336
1337 if (disable)
1338 hlt_cpus_mask |= logical_cpus_mask;
1339 else
1340 hlt_cpus_mask &= ~logical_cpus_mask;
1341
1342 if (! hyperthreading_allowed)
1343 hlt_cpus_mask |= hyperthreading_cpus_mask;
1344
1345 if ((hlt_cpus_mask & all_cpus) == all_cpus)
1346 hlt_cpus_mask &= ~(1<<0);
1347
1348 hlt_logical_cpus = disable;
1349 return (error);
1350 }
1351
1352 static int
1353 sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
1354 {
1355 int allowed, error;
1356
1357 allowed = hyperthreading_allowed;
1358 error = sysctl_handle_int(oidp, &allowed, 0, req);
1359 if (error || !req->newptr)
1360 return (error);
1361
1362 if (allowed)
1363 hlt_cpus_mask &= ~hyperthreading_cpus_mask;
1364 else
1365 hlt_cpus_mask |= hyperthreading_cpus_mask;
1366
1367 if (logical_cpus_mask != 0 &&
1368 (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
1369 hlt_logical_cpus = 1;
1370 else
1371 hlt_logical_cpus = 0;
1372
1373 if ((hlt_cpus_mask & all_cpus) == all_cpus)
1374 hlt_cpus_mask &= ~(1<<0);
1375
1376 hyperthreading_allowed = allowed;
1377 return (error);
1378 }
1379
/*
 * SMP-time setup for the CPU-halting machinery: fetch loader tunables
 * and register the machdep.hlt_logical_cpus / logical_cpus_mask /
 * hyperthreading_allowed sysctls.  Only runs when logical (HTT) CPUs
 * were detected.
 */
static void
cpu_hlt_setup(void *dummy __unused)
{

	if (logical_cpus_mask != 0) {
		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
		    &hlt_logical_cpus);
		sysctl_ctx_init(&logical_cpu_clist);
		SYSCTL_ADD_PROC(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
		    sysctl_hlt_logical_cpus, "IU", "");
		SYSCTL_ADD_UINT(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
		    &logical_cpus_mask, 0, "");

		/* Honor the tunable fetched above. */
		if (hlt_logical_cpus)
			hlt_cpus_mask |= logical_cpus_mask;

		/*
		 * If necessary for security purposes, force
		 * hyperthreading off, regardless of the value
		 * of hlt_logical_cpus.
		 */
		if (hyperthreading_cpus_mask) {
			TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
			    &hyperthreading_allowed);
			SYSCTL_ADD_PROC(&logical_cpu_clist,
			    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
			    "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
			    0, 0, sysctl_hyperthreading_allowed, "IU", "");
			if (! hyperthreading_allowed)
				hlt_cpus_mask |= hyperthreading_cpus_mask;
		}
	}
}
SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);
1418
/*
 * Park this CPU in a sti;hlt loop while its bit is set in hlt_cpus_mask.
 * Returns non-zero iff the CPU's bit was set on entry (i.e. it was
 * halted at least once).
 */
int
mp_grab_cpu_hlt(void)
{
	u_int mask = PCPU_GET(cpumask);
#ifdef MP_WATCHDOG
	u_int cpuid = PCPU_GET(cpuid);
#endif
	int retval;

#ifdef MP_WATCHDOG
	ap_watchdog(cpuid);
#endif

	retval = mask & hlt_cpus_mask;
	/*
	 * "sti; hlt" sleeps with interrupts enabled; each wakeup
	 * (interrupt) re-checks whether we are still asked to halt.
	 */
	while (mask & hlt_cpus_mask)
		__asm __volatile("sti; hlt" : : : "memory");
	return (retval);
}
Cache object: 4c7608509d80d63771b6911d23fd60ad
|