[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/mp_machdep.c

Version: -  FREEBSD  -  FREEBSD7  -  FREEBSD71  -  FREEBSD70  -  FREEBSD6  -  FREEBSD64  -  FREEBSD63  -  FREEBSD62  -  FREEBSD61  -  FREEBSD60  -  FREEBSD5  -  FREEBSD55  -  FREEBSD54  -  FREEBSD53  -  FREEBSD52  -  FREEBSD51  -  FREEBSD50  -  FREEBSD4  -  FREEBSD3  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  OPENSOLARIS  -  minix-3-1-1  -  TRUSTEDBSD-SEBSD  -  FREEBSD-LIBC  -  FREEBSD7-LIBC  -  FREEBSD6-LIBC  -  GLIBC27 
SearchContext: -  none  -  excerpts  -  bigexcerpts 

  1 /*-
  2  * Copyright (c) 1996, by Steve Passe
  3  * Copyright (c) 2003, by Peter Wemm
  4  * All rights reserved.
  5  *
  6  * Redistribution and use in source and binary forms, with or without
  7  * modification, are permitted provided that the following conditions
  8  * are met:
  9  * 1. Redistributions of source code must retain the above copyright
 10  *    notice, this list of conditions and the following disclaimer.
 11  * 2. The name of the developer may NOT be used to endorse or promote products
 12  *    derived from this software without specific prior written permission.
 13  *
 14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 24  * SUCH DAMAGE.
 25  */
 26 
 27 #include <sys/cdefs.h>
 28 __FBSDID("$FreeBSD: src/sys/amd64/amd64/mp_machdep.c,v 1.295 2008/11/26 19:25:13 jkim Exp $");
 29 
 30 #include "opt_cpu.h"
 31 #include "opt_kstack_pages.h"
 32 #include "opt_mp_watchdog.h"
 33 #include "opt_sched.h"
 34 
 35 #include <sys/param.h>
 36 #include <sys/systm.h>
 37 #include <sys/bus.h>
 38 #ifdef GPROF 
 39 #include <sys/gmon.h>
 40 #endif
 41 #include <sys/kernel.h>
 42 #include <sys/ktr.h>
 43 #include <sys/lock.h>
 44 #include <sys/malloc.h>
 45 #include <sys/memrange.h>
 46 #include <sys/mutex.h>
 47 #include <sys/pcpu.h>
 48 #include <sys/proc.h>
 49 #include <sys/sched.h>
 50 #include <sys/smp.h>
 51 #include <sys/sysctl.h>
 52 
 53 #include <vm/vm.h>
 54 #include <vm/vm_param.h>
 55 #include <vm/pmap.h>
 56 #include <vm/vm_kern.h>
 57 #include <vm/vm_extern.h>
 58 
 59 #include <machine/apicreg.h>
 60 #include <machine/cputypes.h>
 61 #include <machine/md_var.h>
 62 #include <machine/mp_watchdog.h>
 63 #include <machine/pcb.h>
 64 #include <machine/psl.h>
 65 #include <machine/smp.h>
 66 #include <machine/specialreg.h>
 67 #include <machine/tss.h>
 68 
     /*
      * BIOS warm-boot vector in low memory (addressed through KERNBASE).
      * start_all_aps() stores WARMBOOT_TARGET at WARMBOOT_OFF and
      * (boot_address >> 4) at WARMBOOT_SEG so that the vector points at the
      * AP boot code, and restores the saved value afterwards.
      */
  69 #define WARMBOOT_TARGET         0
  70 #define WARMBOOT_OFF            (KERNBASE + 0x0467)
  71 #define WARMBOOT_SEG            (KERNBASE + 0x0469)
  72 
     /*
      * I/O ports and values used by start_all_aps(): it writes BIOS_RESET to
      * CMOS_REG and then reads or writes CMOS_DATA; BIOS_WARM is the value it
      * stores there to request a 'warm-start'.
      */
  73 #define CMOS_REG                (0x70)
  74 #define CMOS_DATA               (0x71)
  75 #define BIOS_RESET              (0x0f)
  76 #define BIOS_WARM               (0x0a)
 77 
  78 /* lock region used by kernel profiling */
  79 int     mcount_lock;
  80 
     /*
      * mp_naps is incremented by each AP in init_secondary(); start_ap()
      * watches it to detect that the AP it just kicked has come up.
      */
  81 int     mp_naps;                /* # of Application Processors started */
  82 int     boot_cpu_id = -1;       /* APIC ID of the designated BSP, -1 until known */
  83 
  84 extern  struct pcpu __pcpu[];
  85 
  86 /* AP uses this during bootstrap.  Do not staticize.  */
  87 char *bootSTK;
     /* Logical CPU id of the AP being started: set by start_all_aps(), read by init_secondary(). */
  88 static int bootAP;
  89 
  90 /* Free these after use */
     /* One boot stack per AP, allocated in start_all_aps(). */
  91 void *bootstacks[MAXCPU];
  92 
  93 /* Temporary holder for double fault stack */
     /* Re-allocated by start_all_aps() for every AP and consumed by init_secondary(). */
  94 char *doublefault_stack;
  95 
  96 /* Hotwire a 0->4MB V==P mapping */
  97 extern pt_entry_t *KPTphys;
  98 
  99 /* SMP page table page */
 100 extern pt_entry_t *SMPpt;
 101 
 102 extern int  _udatasel;
 103 
 104 struct pcb stoppcbs[MAXCPU];
 105 
 106 /* Variables needed for SMP tlb shootdown. */
     /*
      * The initiator stores the address arguments, resets smp_tlb_wait to 0,
      * sends the IPI and then spins until smp_tlb_wait reaches the number of
      * target CPUs.  NOTE(review): the code that increments smp_tlb_wait is
      * not in this part of the file -- presumably the IPI handlers.
      */
 107 vm_offset_t smp_tlb_addr1;
 108 vm_offset_t smp_tlb_addr2;
 109 volatile int smp_tlb_wait;
 110 
 111 extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
 112 
 113 #ifdef STOP_NMI
 114 volatile cpumask_t ipi_nmi_pending;
 115 
 116 static void     ipi_nmi_selected(u_int32_t cpus);
 117 #endif 
118 
 119 /*
 120  * Local data and functions.
 121  */
 122 
 123 #ifdef STOP_NMI
 124 /* 
 125  * Provide an alternate method of stopping other CPUs. If another CPU has
 126  * disabled interrupts the conventional STOP IPI will be blocked. This 
 127  * NMI-based stop should get through in that case.
      *
      * Exposed both as the read/write sysctl and as the loader tunable
      * debug.stop_cpus_with_nmi; without STOP_NMI it is the constant 0.
 128  */
 129 static int stop_cpus_with_nmi = 1;
 130 SYSCTL_INT(_debug, OID_AUTO, stop_cpus_with_nmi, CTLTYPE_INT | CTLFLAG_RW,
 131     &stop_cpus_with_nmi, 0, "");
 132 TUNABLE_INT("debug.stop_cpus_with_nmi", &stop_cpus_with_nmi);
 133 #else
 134 #define stop_cpus_with_nmi      0
 135 #endif
 136 
     /*
      * Set in cpu_mp_start(): 0, or a count extracted from cpu_procinfo when
      * the CPUID_HTT feature bit is present.
      */
 137 static u_int logical_cpus;
 138 
 139 /* used to hold the AP's until we are ready to release them */
     /* Taken by each AP in init_secondary() around its final setup steps. */
 140 static struct mtx ap_boot_mtx;
 141 
 142 /* Set to 1 once we're ready to let the APs out of the pen. */
     /* APs spin on this flag in init_secondary() after bumping mp_naps. */
 143 static volatile int aps_ready = 0;
144 
145 /*
146  * Store data from cpu_add() until later in the boot when we actually setup
147  * the APs.
148  */
149 struct cpu_info {
150         int     cpu_present:1;
151         int     cpu_bsp:1;
152         int     cpu_disabled:1;
153 } static cpu_info[MAX_APIC_ID + 1];
     /* Logical CPU id -> local APIC ID; cpu_mp_start() initializes every slot to -1 (unused). */
 154 int cpu_apic_ids[MAXCPU];
 155 
 156 /* Holds pending bitmap based IPIs per CPU */
 157 static volatile u_int cpu_ipi_pending[MAXCPU];
 158 
     /* Address of the AP trampoline in base memory, computed by mp_bootaddress(). */
 159 static u_int boot_address;
 160 
 161 static void     assign_cpu_ids(void);
 162 static void     set_interrupt_apic_ids(void);
 163 static int      start_all_aps(void);
 164 static int      start_ap(int apic_id);
 165 static void     release_aps(void *dummy);
 166 
 167 static int      hlt_logical_cpus;
     /* Threads sharing a cache as determined in cpu_mp_start(); 0 when not determined. */
 168 static u_int    hyperthreading_cpus;
     /* CPUs whose APIC ID is not a multiple of hyperthreading_cpus (filled in by init_secondary()). */
 169 static cpumask_t        hyperthreading_cpus_mask;
 170 static int      hyperthreading_allowed = 1;
 171 static struct   sysctl_ctx_list logical_cpu_clist;
     /* Size of the trampoline code (mptramp_end - mptramp_start), set by mp_bootaddress(). */
 172 static u_int    bootMP_size;
173 
174 static void
175 mem_range_AP_init(void)
176 {
177         if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
178                 mem_range_softc.mr_op->initAP(&mem_range_softc);
179 }
180 
181 struct cpu_group *
182 cpu_topo(void)
183 {
184         if (cpu_cores == 0)
185                 cpu_cores = 1;
186         if (cpu_logical == 0)
187                 cpu_logical = 1;
188         if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
189                 printf("WARNING: Non-uniform processors.\n");
190                 printf("WARNING: Using suboptimal topology.\n");
191                 return (smp_topo_none());
192         }
193         /*
194          * No multi-core or hyper-threaded.
195          */
196         if (cpu_logical * cpu_cores == 1)
197                 return (smp_topo_none());
198         /*
199          * Only HTT no multi-core.
200          */
201         if (cpu_logical > 1 && cpu_cores == 1)
202                 return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
203         /*
204          * Only multi-core no HTT.
205          */
206         if (cpu_cores > 1 && cpu_logical == 1)
207                 return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
208         /*
209          * Both HTT and multi-core.
210          */
211         return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
212             CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
213 }
214 
215 /*
216  * Calculate usable address in base memory for AP trampoline code.
217  */
218 u_int
219 mp_bootaddress(u_int basemem)
220 {
221 
222         bootMP_size = mptramp_end - mptramp_start;
223         boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */
224         if (((basemem * 1024) - boot_address) < bootMP_size)
225                 boot_address -= PAGE_SIZE;      /* not enough, lower by 4k */
226         /* 3 levels of page table pages */
227         mptramp_pagetables = boot_address - (PAGE_SIZE * 3);
228 
229         return mptramp_pagetables;
230 }
231 
232 void
233 cpu_add(u_int apic_id, char boot_cpu)
234 {
235 
236         if (apic_id > MAX_APIC_ID) {
237                 panic("SMP: APIC ID %d too high", apic_id);
238                 return;
239         }
240         KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
241             apic_id));
242         cpu_info[apic_id].cpu_present = 1;
243         if (boot_cpu) {
244                 KASSERT(boot_cpu_id == -1,
245                     ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
246                     boot_cpu_id));
247                 boot_cpu_id = apic_id;
248                 cpu_info[apic_id].cpu_bsp = 1;
249         }
250         if (mp_ncpus < MAXCPU) {
251                 mp_ncpus++;
252                 mp_maxid = mp_ncpus -1;
253         }
254         if (bootverbose)
255                 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
256                     "AP");
257 }
258 
259 void
260 cpu_mp_setmaxid(void)
261 {
262 
263         /*
264          * mp_maxid should be already set by calls to cpu_add().
265          * Just sanity check its value here.
266          */
267         if (mp_ncpus == 0)
268                 KASSERT(mp_maxid == 0,
269                     ("%s: mp_ncpus is zero, but mp_maxid is not", __func__));
270         else if (mp_ncpus == 1)
271                 mp_maxid = 0;
272         else
273                 KASSERT(mp_maxid >= mp_ncpus - 1,
274                     ("%s: counters out of sync: max %d, count %d", __func__,
275                         mp_maxid, mp_ncpus));           
276 }
277 
278 int
279 cpu_mp_probe(void)
280 {
281 
282         /*
283          * Always record BSP in CPU map so that the mbuf init code works
284          * correctly.
285          */
286         all_cpus = 1;
287         if (mp_ncpus == 0) {
288                 /*
289                  * No CPUs were found, so this must be a UP system.  Setup
290                  * the variables to represent a system with a single CPU
291                  * with an id of 0.
292                  */
293                 mp_ncpus = 1;
294                 return (0);
295         }
296 
297         /* At least one CPU was found. */
298         if (mp_ncpus == 1) {
299                 /*
300                  * One CPU was found, so this must be a UP system with
301                  * an I/O APIC.
302                  */
303                 mp_maxid = 0;
304                 return (0);
305         }
306 
307         /* At least two CPUs were found. */
308         return (1);
309 }
310 
 311 /*
 312  * Initialize the IPI handlers and start up the AP's.
      *
      * In order: reset the logical-id -> APIC-ID table and the per-CPU
      * pending-IPI words, install the IPI vectors in the IDT, settle
      * boot_cpu_id, assign logical CPU ids, start the APs, work out the
      * logical-CPU / hyperthreading counts, and finally register the CPUs
      * that may receive interrupts.
 313  */
 314 void
 315 cpu_mp_start(void)
 316 {
 317         int i;
 318         u_int threads_per_cache, p[4];
 319 
 320         /* Initialize the logical ID to APIC ID table. */
 321         for (i = 0; i < MAXCPU; i++) {
 322                 cpu_apic_ids[i] = -1;
 323                 cpu_ipi_pending[i] = 0;
 324         }
 325 
 326         /* Install an inter-CPU IPI for TLB invalidation */
 327         setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
 328         setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
 329         setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);
 330 
 331         /* Install an inter-CPU IPI for cache invalidation. */
 332         setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0);
 333 
 334         /* Install an inter-CPU IPI for all-CPU rendezvous */
 335         setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);
 336 
 337         /* Install generic inter-CPU IPI handler */
 338         setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
 339                SDT_SYSIGT, SEL_KPL, 0);
 340 
 341         /* Install an inter-CPU IPI for CPU stop/restart */
 342         setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);
 343 
 344         /* Set boot_cpu_id if needed. */
             /* -1 means no cpu_add() call named a BSP; use the running CPU's APIC ID. */
 345         if (boot_cpu_id == -1) {
 346                 boot_cpu_id = PCPU_GET(apic_id);
 347                 cpu_info[boot_cpu_id].cpu_bsp = 1;
 348         } else
 349                 KASSERT(boot_cpu_id == PCPU_GET(apic_id),
 350                     ("BSP's APIC ID doesn't match boot_cpu_id"));
             /* Logical CPU 0 is always the BSP. */
 351         cpu_apic_ids[0] = boot_cpu_id;
 352 
 353         assign_cpu_ids();
 354 
 355         /* Start each Application Processor */
             /*
              * The return value (mp_naps) is not used here; start_all_aps()
              * panics if an AP fails to start.
              */
 356         start_all_aps();
 357 
 358         /* Setup the initial logical CPUs info. */
 359         logical_cpus = logical_cpus_mask = 0;
 360         if (cpu_feature & CPUID_HTT)
 361                 logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
 362 
 363         /*
 364          * Work out if hyperthreading is *really* enabled.  This
 365          * is made really ugly by the fact that processors lie: Dual
 366          * core processors claim to be hyperthreaded even when they're
 367          * not, presumably because they want to be treated the same
 368          * way as HTT with respect to per-cpu software licensing.
 369          * At the time of writing (May 12, 2005) the only hyperthreaded
 370          * cpus are from Intel, and Intel's dual-core processors can be
 371          * identified via the "deterministic cache parameters" cpuid
 372          * calls.
 373          */
 374         /*
 375          * First determine if this is an Intel processor which claims
 376          * to have hyperthreading support.
 377          */
 378         if ((cpu_feature & CPUID_HTT) && cpu_vendor_id == CPU_VENDOR_INTEL) {
 379                 /*
 380                  * If the "deterministic cache parameters" cpuid calls
 381                  * are available, use them.
 382                  */
 383                 if (cpu_high >= 4) {
 384                         /* Ask the processor about the L1 cache. */
                             /*
                              * The loop bound of 1 means only sub-leaf 0 is
                              * queried.  threads_per_cache is bits 14-25 of
                              * p[0], plus one.
                              */
 385                         for (i = 0; i < 1; i++) {
 386                                 cpuid_count(4, i, p);
 387                                 threads_per_cache = ((p[0] & 0x3ffc000) >> 14) + 1;
 388                                 if (hyperthreading_cpus < threads_per_cache)
 389                                         hyperthreading_cpus = threads_per_cache;
 390                                 if ((p[0] & 0x1f) == 0)
 391                                         break;
 392                         }
 393                 }
 394 
 395                 /*
 396                  * If the deterministic cache parameters are not
 397                  * available, or if no caches were reported to exist,
 398                  * just accept what the HTT flag indicated.
 399                  */
 400                 if (hyperthreading_cpus == 0)
 401                         hyperthreading_cpus = logical_cpus;
 402         }
 403 
             /* Uses hyperthreading_cpus, so it must run after the detection above. */
 404         set_interrupt_apic_ids();
 405 }
406 
407 
408 /*
409  * Print various information about the SMP system hardware and setup.
410  */
411 void
412 cpu_mp_announce(void)
413 {
414         int i, x;
415 
416         /* List CPUs */
417         printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
418         for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
419                 if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
420                         continue;
421                 if (cpu_info[x].cpu_disabled)
422                         printf("  cpu (AP): APIC ID: %2d (disabled)\n", x);
423                 else {
424                         KASSERT(i < mp_ncpus,
425                             ("mp_ncpus and actual cpus are out of whack"));
426                         printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
427                 }
428         }
429 }
430 
 431 /*
 432  * AP CPU's call this to initialize themselves.
      *
      * The logical CPU id is taken from bootAP, which start_all_aps() sets
      * before kicking each AP.  The AP builds its TSS, private GDT and
      * per-CPU data, programs the fast-syscall MSRs, announces itself by
      * incrementing mp_naps, and then spins until aps_ready is set.  After
      * release it finishes CPU/FPU/APIC setup under ap_boot_mtx, waits for
      * smp_started and enters the scheduler via sched_throw().  This function
      * is not expected to return; it panics if the scheduler hands control
      * back.
 433  */
 434 void
 435 init_secondary(void)
 436 {
 437         struct pcpu *pc;
 438         u_int64_t msr, cr0;
 439         int cpu, gsel_tss, x;
 440         struct region_descriptor ap_gdt;
 441 
 442         /* Set by the startup code for us to use */
 443         cpu = bootAP;
 444 
 445         /* Init tss */
             /* Start from a copy of the BSP's TSS (slot 0), then override per-CPU fields. */
 446         common_tss[cpu] = common_tss[0];
 447         common_tss[cpu].tss_rsp0 = 0;   /* not used until after switch */
 448         common_tss[cpu].tss_iobase = sizeof(struct amd64tss);
             /* IST1 points at the end of the page start_all_aps() allocated for this AP. */
 449         common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];
 450 
 451         /* Prepare private GDT */
             /*
              * Each CPU uses its own NGDT-entry slice of gdt[].  The TSS entry is
              * written as a system descriptor; the loop below skips GPROC0_SEL
              * and the slot after it -- presumably because that descriptor spans
              * two GDT slots (TODO confirm).
              */
 452         gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
 453         ssdtosyssd(&gdt_segs[GPROC0_SEL],
 454            (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]);
 455         for (x = 0; x < NGDT; x++) {
 456                 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
 457                         ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]);
 458         }
 459         ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 460         ap_gdt.rd_base =  (long) &gdt[NGDT * cpu];
 461         lgdt(&ap_gdt);                  /* does magic intra-segment return */
 462 
 463         /* Get per-cpu data */
 464         pc = &__pcpu[cpu];
 465 
 466         /* prime data page for it to use */
 467         pcpu_init(pc, cpu, sizeof(struct pcpu));
 468         pc->pc_apic_id = cpu_apic_ids[cpu];
 469         pc->pc_prvspace = pc;
 470         pc->pc_curthread = 0;
 471         pc->pc_tssp = &common_tss[cpu];
 472         pc->pc_rsp0 = 0;
 473         pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL];
 474 
 475         wrmsr(MSR_FSBASE, 0);           /* User value */
 476         wrmsr(MSR_GSBASE, (u_int64_t)pc);
 477         wrmsr(MSR_KGSBASE, (u_int64_t)pc);      /* XXX User value while we're in the kernel */
 478 
 479         lidt(&r_idt);
 480 
 481         gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 482         ltr(gsel_tss);
 483 
 484         /*
 485          * Set to a known state:
 486          * Set by mpboot.s: CR0_PG, CR0_PE
 487          * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
 488          */
 489         cr0 = rcr0();
 490         cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
 491         load_cr0(cr0);
 492 
 493         /* Set up the fast syscall stuff */
 494         msr = rdmsr(MSR_EFER) | EFER_SCE;
 495         wrmsr(MSR_EFER, msr);
 496         wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
 497         wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
 498         msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
 499               ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
 500         wrmsr(MSR_STAR, msr);
 501         wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
 502 
 503         /* Disable local APIC just to be sure. */
 504         lapic_disable();
 505 
 506         /* signal our startup to the BSP. */
             /* start_ap() polls mp_naps to see this AP arrive. */
 507         mp_naps++;
 508 
 509         /* Spin until the BSP releases the AP's. */
 510         while (!aps_ready)
 511                 ia32_pause();
 512 
 513         /* Initialize the PAT MSR. */
 514         pmap_init_pat();
 515 
 516         /* set up CPU registers and state */
 517         cpu_setregs();
 518 
 519         /* set up SSE/NX registers */
 520         initializecpu();
 521 
 522         /* set up FPU state on the AP */
 523         fpuinit();
 524 
 525         /* A quick check from sanity claus */
             /* The APIC ID recorded in per-CPU data must match what the hardware reports. */
 526         if (PCPU_GET(apic_id) != lapic_id()) {
 527                 printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
 528                 printf("SMP: actual apic_id = %d\n", lapic_id());
 529                 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
 530                 panic("cpuid mismatch! boom!!");
 531         }
 532 
 533         /* Initialize curthread. */
 534         KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
 535         PCPU_SET(curthread, PCPU_GET(idlethread));
 536 
             /* Everything up to the matching unlock is serialized across APs. */
 537         mtx_lock_spin(&ap_boot_mtx);
 538 
 539         /* Init local apic for irq's */
 540         lapic_setup(1);
 541 
 542         /* Set memory range attributes for this CPU to match the BSP */
 543         mem_range_AP_init();
 544 
 545         smp_cpus++;
 546 
 547         CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
 548         printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
 549 
 550         /* Determine if we are a logical CPU. */
 551         if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
 552                 logical_cpus_mask |= PCPU_GET(cpumask);
 553         
 554         /* Determine if we are a hyperthread. */
 555         if (hyperthreading_cpus > 1 &&
 556             PCPU_GET(apic_id) % hyperthreading_cpus != 0)
 557                 hyperthreading_cpus_mask |= PCPU_GET(cpumask);
 558 
 559         /* Build our map of 'other' CPUs. */
 560         PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
 561 
 562         if (bootverbose)
 563                 lapic_dump("AP");
 564 
             /* The last AP to get here (count matches mp_ncpus) declares SMP started. */
 565         if (smp_cpus == mp_ncpus) {
 566                 /* enable IPI's, tlb shootdown, freezes etc */
 567                 atomic_store_rel_int(&smp_started, 1);
 568                 smp_active = 1;  /* historic */
 569         }
 570 
 571         /*
 572          * Enable global pages TLB extension
 573          * This also implicitly flushes the TLB 
 574          */
 575 
 576         load_cr4(rcr4() | CR4_PGE);
 577         load_ds(_udatasel);
 578         load_es(_udatasel);
 579         load_fs(_udatasel);
 580         mtx_unlock_spin(&ap_boot_mtx);
 581 
 582         /* wait until all the AP's are up */
 583         while (smp_started == 0)
 584                 ia32_pause();
 585 
             /* Hand this CPU to the scheduler; control is not expected to come back. */
 586         sched_throw(NULL);
 587 
 588         panic("scheduler returned us to %s", __func__);
 589         /* NOTREACHED */
 590 }
591 
592 /*******************************************************************
593  * local functions and data
594  */
595 
596 /*
597  * We tell the I/O APIC code about all the CPUs we want to receive
598  * interrupts.  If we don't want certain CPUs to receive IRQs we
599  * can simply not tell the I/O APIC code about them in this function.
600  * We also do not tell it about the BSP since it tells itself about
601  * the BSP internally to work with UP kernels and on UP machines.
602  */
603 static void
604 set_interrupt_apic_ids(void)
605 {
606         u_int i, apic_id;
607 
608         for (i = 0; i < MAXCPU; i++) {
609                 apic_id = cpu_apic_ids[i];
610                 if (apic_id == -1)
611                         continue;
612                 if (cpu_info[apic_id].cpu_bsp)
613                         continue;
614                 if (cpu_info[apic_id].cpu_disabled)
615                         continue;
616 
617                 /* Don't let hyperthreads service interrupts. */
618                 if (hyperthreading_cpus > 1 &&
619                     apic_id % hyperthreading_cpus != 0)
620                         continue;
621 
622                 intr_add_cpu(i);
623         }
624 }
625 
626 /*
627  * Assign logical CPU IDs to local APICs.
628  */
629 static void
630 assign_cpu_ids(void)
631 {
632         u_int i;
633 
634         /* Check for explicitly disabled CPUs. */
635         for (i = 0; i <= MAX_APIC_ID; i++) {
636                 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
637                         continue;
638 
639                 /* Don't use this CPU if it has been disabled by a tunable. */
640                 if (resource_disabled("lapic", i)) {
641                         cpu_info[i].cpu_disabled = 1;
642                         continue;
643                 }
644         }
645 
646         /*
647          * Assign CPU IDs to local APIC IDs and disable any CPUs
648          * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
649          * so we only have to assign IDs for APs.
650          */
651         mp_ncpus = 1;
652         for (i = 0; i <= MAX_APIC_ID; i++) {
653                 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
654                     cpu_info[i].cpu_disabled)
655                         continue;
656 
657                 if (mp_ncpus < MAXCPU) {
658                         cpu_apic_ids[mp_ncpus] = i;
659                         mp_ncpus++;
660                 } else
661                         cpu_info[i].cpu_disabled = 1;
662         }
663         KASSERT(mp_maxid >= mp_ncpus - 1,
664             ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
665             mp_ncpus));         
666 }
667 
 668 /*
 669  * start each AP in our list
      *
      * Copies the trampoline to boot_address, builds the three bootstrap
      * page-table pages below it, points the BIOS warm-boot vector at the
      * trampoline, and then starts the APs one at a time with start_ap().
      * Panics if any AP fails to start.  The warm-boot vector and the CMOS
      * reset code are restored before returning.
      *
      * Returns mp_naps, the number of APs that have signalled startup.
 670  */
 671 static int
 672 start_all_aps(void)
 673 {
 674         vm_offset_t va = boot_address + KERNBASE;
 675         u_int64_t *pt4, *pt3, *pt2;
 676         u_int32_t mpbioswarmvec;
 677         int apic_id, cpu, i;
 678         u_char mpbiosreason;
 679 
 680         mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
 681 
 682         /* install the AP 1st level boot code */
 683         pmap_kenter(va, boot_address);
 684         pmap_invalidate_page(kernel_pmap, va);
 685         bcopy(mptramp_start, (void *)va, bootMP_size);
 686 
 687         /* Locate the page tables, they'll be below the trampoline */
             /* Three consecutive pages: level 4, then level 3, then level 2. */
 688         pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE);
 689         pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
 690         pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
 691 
 692         /* Create the initial 1GB replicated page tables */
 693         for (i = 0; i < 512; i++) {
 694                 /* Each slot of the level 4 pages points to the same level 3 page */
 695                 pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
 696                 pt4[i] |= PG_V | PG_RW | PG_U;
 697 
 698                 /* Each slot of the level 3 pages points to the same level 2 page */
 699                 pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
 700                 pt3[i] |= PG_V | PG_RW | PG_U;
 701 
 702                 /* The level 2 page slots are mapped with 2MB pages for 1GB. */
 703                 pt2[i] = i * (2 * 1024 * 1024);
 704                 pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
 705         }
 706 
 707         /* save the current value of the warm-start vector */
             /* Also remember the CMOS reset code so it can be put back at the end. */
 708         mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
 709         outb(CMOS_REG, BIOS_RESET);
 710         mpbiosreason = inb(CMOS_DATA);
 711 
 712         /* setup a vector to our boot code */
 713         *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
 714         *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
 715         outb(CMOS_REG, BIOS_RESET);
 716         outb(CMOS_DATA, BIOS_WARM);     /* 'warm-start' */
 717 
 718         /* start each AP */
             /*
              * bootSTK, bootAP and doublefault_stack are single globals that the
              * AP reads in its startup path, so the APs are started strictly one
              * after another.
              */
 719         for (cpu = 1; cpu < mp_ncpus; cpu++) {
 720                 apic_id = cpu_apic_ids[cpu];
 721 
 722                 /* allocate and set up an idle stack data page */
 723                 bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
 724                 doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
 725 
                     /* Initial stack pointer: 8 bytes below the top of the new stack. */
 726                 bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
 727                 bootAP = cpu;
 728 
 729                 /* attempt to start the Application Processor */
 730                 if (!start_ap(apic_id)) {
 731                         /* restore the warmstart vector */
 732                         *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
 733                         panic("AP #%d (PHY# %d) failed!", cpu, apic_id);
 734                 }
 735 
 736                 all_cpus |= (1 << cpu);         /* record AP in CPU map */
 737         }
 738 
 739         /* build our map of 'other' CPUs */
 740         PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
 741 
 742         /* restore the warmstart vector */
 743         *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
 744 
 745         outb(CMOS_REG, BIOS_RESET);
 746         outb(CMOS_DATA, mpbiosreason);
 747 
 748         /* number of APs actually started */
 749         return mp_naps;
 750 }
751 
752 
753 /*
754  * This function starts the AP (application processor) identified
755  * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
756  * to accomplish this.  This is necessary because of the nuances
757  * of the different hardware we might encounter.  It isn't pretty,
758  * but it seems to work.
759  */
760 static int
761 start_ap(int apic_id)
762 {
763         int vector, ms;
764         int cpus;
765 
766         /* calculate the vector */
767         vector = (boot_address >> 12) & 0xff;
768 
769         /* used as a watchpoint to signal AP startup */
770         cpus = mp_naps;
771 
772         /*
773          * first we do an INIT/RESET IPI this INIT IPI might be run, reseting
774          * and running the target CPU. OR this INIT IPI might be latched (P5
775          * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
776          * ignored.
777          */
778 
779         /* do an INIT IPI: assert RESET */
780         lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
781             APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
782 
783         /* wait for pending status end */
784         lapic_ipi_wait(-1);
785 
786         /* do an INIT IPI: deassert RESET */
787         lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
788             APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);
789 
790         /* wait for pending status end */
791         DELAY(10000);           /* wait ~10mS */
792         lapic_ipi_wait(-1);
793 
794         /*
795          * next we do a STARTUP IPI: the previous INIT IPI might still be
796          * latched, (P5 bug) this 1st STARTUP would then terminate
797          * immediately, and the previously started INIT IPI would continue. OR
798          * the previous INIT IPI has already run. and this STARTUP IPI will
799          * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
800          * will run.
801          */
802 
803         /* do a STARTUP IPI */
804         lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
805             APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
806             vector, apic_id);
807         lapic_ipi_wait(-1);
808         DELAY(200);             /* wait ~200uS */
809 
810         /*
811          * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
812          * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
813          * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
814          * recognized after hardware RESET or INIT IPI.
815          */
816 
817         lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
818             APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
819             vector, apic_id);
820         lapic_ipi_wait(-1);
821         DELAY(200);             /* wait ~200uS */
822 
823         /* Wait up to 5 seconds for it to start. */
824         for (ms = 0; ms < 5000; ms++) {
825                 if (mp_naps > cpus)
826                         return 1;       /* return SUCCESS */
827                 DELAY(1000);
828         }
829         return 0;               /* return FAILURE */
830 }
831 
832 /*
833  * Flush the TLB on all other CPU's
834  */
835 static void
836 smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
837 {
838         u_int ncpu;
839 
840         ncpu = mp_ncpus - 1;    /* does not shootdown self */
841         if (ncpu < 1)
842                 return;         /* no other cpus */
843         if (!(read_rflags() & PSL_I))
844                 panic("%s: interrupts disabled", __func__);
845         mtx_lock_spin(&smp_ipi_mtx);
846         smp_tlb_addr1 = addr1;
847         smp_tlb_addr2 = addr2;
848         atomic_store_rel_int(&smp_tlb_wait, 0);
849         ipi_all_but_self(vector);
850         while (smp_tlb_wait < ncpu)
851                 ia32_pause();
852         mtx_unlock_spin(&smp_ipi_mtx);
853 }
854 
855 static void
856 smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
857 {
858         int ncpu, othercpus;
859 
860         othercpus = mp_ncpus - 1;
861         if (mask == (u_int)-1) {
862                 ncpu = othercpus;
863                 if (ncpu < 1)
864                         return;
865         } else {
866                 mask &= ~PCPU_GET(cpumask);
867                 if (mask == 0)
868                         return;
869                 ncpu = bitcount32(mask);
870                 if (ncpu > othercpus) {
871                         /* XXX this should be a panic offence */
872                         printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
873                             ncpu, othercpus);
874                         ncpu = othercpus;
875                 }
876                 /* XXX should be a panic, implied by mask == 0 above */
877                 if (ncpu < 1)
878                         return;
879         }
880         if (!(read_rflags() & PSL_I))
881                 panic("%s: interrupts disabled", __func__);
882         mtx_lock_spin(&smp_ipi_mtx);
883         smp_tlb_addr1 = addr1;
884         smp_tlb_addr2 = addr2;
885         atomic_store_rel_int(&smp_tlb_wait, 0);
886         if (mask == (u_int)-1)
887                 ipi_all_but_self(vector);
888         else
889                 ipi_selected(mask, vector);
890         while (smp_tlb_wait < ncpu)
891                 ia32_pause();
892         mtx_unlock_spin(&smp_ipi_mtx);
893 }
894 
895 void
896 smp_cache_flush(void)
897 {
898 
899         if (smp_started)
900                 smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
901 }
902 
903 void
904 smp_invltlb(void)
905 {
906 
907         if (smp_started) {
908                 smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
909         }
910 }
911 
912 void
913 smp_invlpg(vm_offset_t addr)
914 {
915 
916         if (smp_started)
917                 smp_tlb_shootdown(IPI_INVLPG, addr, 0);
918 }
919 
920 void
921 smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
922 {
923 
924         if (smp_started) {
925                 smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
926         }
927 }
928 
929 void
930 smp_masked_invltlb(u_int mask)
931 {
932 
933         if (smp_started) {
934                 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
935         }
936 }
937 
938 void
939 smp_masked_invlpg(u_int mask, vm_offset_t addr)
940 {
941 
942         if (smp_started) {
943                 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
944         }
945 }
946 
947 void
948 smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
949 {
950 
951         if (smp_started) {
952                 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
953         }
954 }
955 
956 void
957 ipi_bitmap_handler(struct trapframe frame)
958 {
959         int cpu = PCPU_GET(cpuid);
960         u_int ipi_bitmap;
961 
962         ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
963 
964         if (ipi_bitmap & (1 << IPI_PREEMPT))
965                 sched_preempt(curthread);
966 
967         /* Nothing to do for AST */
968 }
969 
970 /*
971  * send an IPI to a set of cpus.
972  */
973 void
974 ipi_selected(u_int32_t cpus, u_int ipi)
975 {
976         int cpu;
977         u_int bitmap = 0;
978         u_int old_pending;
979         u_int new_pending;
980 
981         if (IPI_IS_BITMAPED(ipi)) { 
982                 bitmap = 1 << ipi;
983                 ipi = IPI_BITMAP_VECTOR;
984         }
985 
986 #ifdef STOP_NMI
98