1 /*-
2 * Copyright (c) 1996, by Steve Passe
3 * Copyright (c) 2008, by Kip Macy
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. The name of the developer may NOT be used to endorse or promote products
12 * derived from this software without specific prior written permission.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include "opt_apic.h"
31 #include "opt_cpu.h"
32 #include "opt_kstack_pages.h"
33 #include "opt_mp_watchdog.h"
34 #include "opt_pmap.h"
35 #include "opt_sched.h"
36 #include "opt_smp.h"
37
38 #if !defined(lint)
39 #if !defined(SMP)
40 #error How did you get here?
41 #endif
42
43 #ifndef DEV_APIC
44 #error The apic device is required for SMP, add "device apic" to your config file.
45 #endif
46 #if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
47 #error SMP not supported with CPU_DISABLE_CMPXCHG
48 #endif
49 #endif /* not lint */
50
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/bus.h>
54 #include <sys/cons.h> /* cngetc() */
55 #include <sys/cpuset.h>
56 #ifdef GPROF
57 #include <sys/gmon.h>
58 #endif
59 #include <sys/kernel.h>
60 #include <sys/ktr.h>
61 #include <sys/lock.h>
62 #include <sys/malloc.h>
63 #include <sys/memrange.h>
64 #include <sys/mutex.h>
65 #include <sys/pcpu.h>
66 #include <sys/proc.h>
67 #include <sys/sched.h>
68 #include <sys/smp.h>
69 #include <sys/sysctl.h>
70
71 #include <vm/vm.h>
72 #include <vm/vm_param.h>
73 #include <vm/pmap.h>
74 #include <vm/vm_kern.h>
75 #include <vm/vm_extern.h>
76 #include <vm/vm_page.h>
77
78 #include <x86/apicreg.h>
79 #include <machine/md_var.h>
80 #include <machine/mp_watchdog.h>
81 #include <machine/pcb.h>
82 #include <machine/psl.h>
83 #include <machine/smp.h>
84 #include <machine/specialreg.h>
85 #include <machine/pcpu.h>
86
87
88
89 #include <machine/xen/xen-os.h>
90 #include <xen/evtchn.h>
91 #include <xen/xen_intr.h>
92 #include <xen/hypervisor.h>
93 #include <xen/interface/vcpu.h>
94
95
96 int mp_naps; /* # of application processors */
97 int boot_cpu_id = -1; /* designated BSP */
98
99 extern struct pcpu __pcpu[];
100
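/*
 * Per-AP bootstrap state: bootAP holds the logical id of the AP currently
 * being started, and bootAPgdt points at that AP's private GDT
 * (start_all_aps() carves 512 descriptors per CPU out of gdt[]).
 */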
101 static int bootAP;
102 static union descriptor *bootAPgdt;
103
104 static char resched_name[NR_CPUS][15];
105 static char callfunc_name[NR_CPUS][15];
106
107 /* Free these after use */
108 void *bootstacks[MAXCPU];
109
110 struct pcb stoppcbs[MAXCPU];
111
112 /* Variables needed for SMP TLB shootdown. */
113 vm_offset_t smp_tlb_addr1;
114 vm_offset_t smp_tlb_addr2;
115 volatile int smp_tlb_wait;
116
117 typedef void call_data_func_t(uintptr_t , uintptr_t);
118
119 static u_int logical_cpus;
120 static volatile cpuset_t ipi_nmi_pending;
121
122 /* used to hold the APs until we are ready to release them */
123 static struct mtx ap_boot_mtx;
124
125 /* Set to 1 once we're ready to let the APs out of the pen. */
126 static volatile int aps_ready = 0;
127
128 /*
129 * Store data from cpu_add() until later in the boot when we actually
130 * set up the APs.
131 */
132 struct cpu_info {
133 int cpu_present:1;
134 int cpu_bsp:1;
135 int cpu_disabled:1;
136 } static cpu_info[MAX_APIC_ID + 1];
137 int cpu_apic_ids[MAXCPU];
138 int apic_cpuids[MAX_APIC_ID + 1];
139
140 /* Holds pending bitmap based IPIs per CPU */
141 static volatile u_int cpu_ipi_pending[MAXCPU];
142
143 static int cpu_logical;
144 static int cpu_cores;
145
146 static void assign_cpu_ids(void);
147 static void set_interrupt_apic_ids(void);
148 int start_all_aps(void);
149 static int start_ap(int apic_id);
150 static void release_aps(void *dummy);
151
152 static u_int hyperthreading_cpus;
153 static cpuset_t hyperthreading_cpus_mask;
154
155 extern void Xhypervisor_callback(void);
156 extern void failsafe_callback(void);
157 extern void pmap_lazyfix_action(void);
158
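/*
 * Build the scheduler's CPU topology from the detected core and thread
 * counts; fall back to a flat topology whenever the counts do not divide
 * mp_ncpus evenly.
 */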
159 struct cpu_group *
160 cpu_topo(void)
161 {
162 if (cpu_cores == 0)
163 cpu_cores = 1;
164 if (cpu_logical == 0)
165 cpu_logical = 1;
166 if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
167 printf("WARNING: Non-uniform processors.\n");
168 printf("WARNING: Using suboptimal topology.\n");
169 return (smp_topo_none());
170 }
171 /*
172 * Neither multi-core nor hyper-threaded.
173 */
174 if (cpu_logical * cpu_cores == 1)
175 return (smp_topo_none());
176 /*
177 * Only HTT, no multi-core.
178 */
179 if (cpu_logical > 1 && cpu_cores == 1)
180 return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
181 /*
182 * Only multi-core, no HTT.
183 */
184 if (cpu_cores > 1 && cpu_logical == 1)
185 return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
186 /*
187 * Both HTT and multi-core.
188 */
189 return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
190 CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
191 }
192
193 /*
194 * Calculate usable address in base memory for AP trampoline code.
195 */
196 u_int
197 mp_bootaddress(u_int basemem)
198 {
199
200 return (basemem);
201 }
202
203 void
204 cpu_add(u_int apic_id, char boot_cpu)
205 {
206
207 if (apic_id > MAX_APIC_ID) {
208 panic("SMP: APIC ID %d too high", apic_id);
209 return;
210 }
211 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
212 apic_id));
213 cpu_info[apic_id].cpu_present = 1;
214 if (boot_cpu) {
215 KASSERT(boot_cpu_id == -1,
216 ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
217 boot_cpu_id));
218 boot_cpu_id = apic_id;
219 cpu_info[apic_id].cpu_bsp = 1;
220 }
221 if (mp_ncpus < MAXCPU)
222 mp_ncpus++;
223 if (bootverbose)
224 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
225 "AP");
226 }
227
228 void
229 cpu_mp_setmaxid(void)
230 {
231
232 mp_maxid = MAXCPU - 1;
233 }
234
235 int
236 cpu_mp_probe(void)
237 {
238
239 /*
240 * Always record BSP in CPU map so that the mbuf init code works
241 * correctly.
242 */
243 CPU_SETOF(0, &all_cpus);
244 if (mp_ncpus == 0) {
245 /*
246 * No CPUs were found, so this must be a UP system. Set up
247 * the variables to represent a system with a single CPU
248 * with an id of 0.
249 */
250 mp_ncpus = 1;
251 return (0);
252 }
253
254 /* At least one CPU was found. */
255 if (mp_ncpus == 1) {
256 /*
257 * One CPU was found, so this must be a UP system with
258 * an I/O APIC.
259 */
260 return (0);
261 }
262
263 /* At least two CPUs were found. */
264 return (1);
265 }
266
267 /*
268 * Initialize the IPI handlers and start up the APs.
269 */
270 void
271 cpu_mp_start(void)
272 {
273 int i;
274
275 /* Initialize the logical ID to APIC ID table. */
276 for (i = 0; i < MAXCPU; i++) {
277 cpu_apic_ids[i] = -1;
278 cpu_ipi_pending[i] = 0;
279 }
280
281 /* Set boot_cpu_id if needed. */
282 if (boot_cpu_id == -1) {
283 boot_cpu_id = PCPU_GET(apic_id);
284 cpu_info[boot_cpu_id].cpu_bsp = 1;
285 } else
286 KASSERT(boot_cpu_id == PCPU_GET(apic_id),
287 ("BSP's APIC ID doesn't match boot_cpu_id"));
288 cpu_apic_ids[0] = boot_cpu_id;
289 apic_cpuids[boot_cpu_id] = 0;
290
291 assign_cpu_ids();
292
293 /* Start each Application Processor */
294 start_all_aps();
295
296 /* Setup the initial logical CPUs info. */
297 logical_cpus = 0;
298 CPU_ZERO(&logical_cpus_mask);
299 if (cpu_feature & CPUID_HTT)
300 logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
301
302 set_interrupt_apic_ids();
303 }
304
305
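/*
 * Bodies of the function-call IPIs. These run on the target CPU, invoked
 * from smp_call_function_interrupt() via the ipi_vectors[] table below.
 */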
306 static void
307 iv_rendezvous(uintptr_t a, uintptr_t b)
308 {
309 smp_rendezvous_action();
310 }
311
312 static void
313 iv_invltlb(uintptr_t a, uintptr_t b)
314 {
315 xen_tlb_flush();
316 }
317
318 static void
319 iv_invlpg(uintptr_t a, uintptr_t b)
320 {
321 xen_invlpg(a);
322 }
323
324 static void
325 iv_invlrng(uintptr_t a, uintptr_t b)
326 {
327 vm_offset_t start = (vm_offset_t)a;
328 vm_offset_t end = (vm_offset_t)b;
329
330 while (start < end) {
331 xen_invlpg(start);
332 start += PAGE_SIZE;
333 }
334 }
335
336
337 static void
338 iv_invlcache(uintptr_t a, uintptr_t b)
339 {
340
341 wbinvd();
342 atomic_add_int(&smp_tlb_wait, 1);
343 }
344
345 static void
346 iv_lazypmap(uintptr_t a, uintptr_t b)
347 {
348 pmap_lazyfix_action();
349 atomic_add_int(&smp_tlb_wait, 1);
350 }
351
352 /*
353 * Function-call IPI handlers, indexed by (vector - APIC_IPI_INTS).
354 */
355 static call_data_func_t *ipi_vectors[6] =
356 {
357 iv_rendezvous,
358 iv_invltlb,
359 iv_invlpg,
360 iv_invlrng,
361 iv_invlcache,
362 iv_lazypmap,
363 };
364
365 /*
366 * Reschedule callback. Handles the bitmapped IPI_PREEMPT and IPI_AST
367 * IPIs; AST needs no explicit work here, it all happens automatically
368 * when we return from the interrupt.
369 */
370 static int
371 smp_reschedule_interrupt(void *unused)
372 {
373 int cpu = PCPU_GET(cpuid);
374 u_int ipi_bitmap;
375
376 ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
377
378 if (ipi_bitmap & (1 << IPI_PREEMPT)) {
379 #ifdef COUNT_IPIS
380 (*ipi_preempt_counts[cpu])++;
381 #endif
382 sched_preempt(curthread);
383 }
384
385 if (ipi_bitmap & (1 << IPI_AST)) {
386 #ifdef COUNT_IPIS
387 (*ipi_ast_counts[cpu])++;
388 #endif
389 /* Nothing to do for AST */
390 }
391 return (FILTER_HANDLED);
392 }
393
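/*
 * Descriptor for an in-flight function-call IPI. The initiating CPU fills
 * one of these in while holding smp_ipi_mtx and points call_data at it;
 * each target CPU reads it from smp_call_function_interrupt().
 */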
394 struct _call_data {
395 uint16_t func_id;
396 uint16_t wait;
397 uintptr_t arg1;
398 uintptr_t arg2;
399 atomic_t started;
400 atomic_t finished;
401 };
402
403 static struct _call_data *call_data;
404
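/*
 * CALL_FUNCTION_VECTOR handler: look up the handler for call_data->func_id
 * in ipi_vectors[], run it, and acknowledge the initiator through the
 * started/finished counters and smp_tlb_wait.
 */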
405 static int
406 smp_call_function_interrupt(void *unused)
407 {
408 call_data_func_t *func;
409 uintptr_t arg1 = call_data->arg1;
410 uintptr_t arg2 = call_data->arg2;
411 int wait = call_data->wait;
412 atomic_t *started = &call_data->started;
413 atomic_t *finished = &call_data->finished;
414
415 /* We only handle function IPIs, not bitmap IPIs */
416 if (call_data->func_id < APIC_IPI_INTS || call_data->func_id > IPI_BITMAP_VECTOR)
417 panic("invalid function id %u", call_data->func_id);
418
419 func = ipi_vectors[call_data->func_id - APIC_IPI_INTS];
420 /*
421 * Notify initiating CPU that I've grabbed the data and am
422 * about to execute the function
423 */
424 mb();
425 atomic_inc(started);
426 /*
427 * At this point the info structure may be out of scope unless wait==1
428 */
429 (*func)(arg1, arg2);
430
431 if (wait) {
432 mb();
433 atomic_inc(finished);
434 }
435 atomic_add_int(&smp_tlb_wait, 1);
436 return (FILTER_HANDLED);
437 }
438
439 /*
440 * Print various information about the SMP system hardware and setup.
441 */
442 void
443 cpu_mp_announce(void)
444 {
445 int i, x;
446
447 /* List CPUs */
448 printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
449 for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
450 if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
451 continue;
452 if (cpu_info[x].cpu_disabled)
453 printf(" cpu (AP): APIC ID: %2d (disabled)\n", x);
454 else {
455 KASSERT(i < mp_ncpus,
456 ("mp_ncpus and actual cpus are out of whack"));
457 printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
458 }
459 }
460 }
461
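/*
 * Bind this CPU's Xen IPI event channels (RESCHEDULE_VECTOR and
 * CALL_FUNCTION_VECTOR) to their handlers and, for APs, set up their
 * clock interrupts via ap_cpu_initclocks().
 */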
462 static int
463 xen_smp_intr_init(unsigned int cpu)
464 {
465 int rc;
466 unsigned int irq;
467
468 per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
469
470 sprintf(resched_name[cpu], "resched%u", cpu);
471 rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
472 cpu,
473 resched_name[cpu],
474 smp_reschedule_interrupt,
475 INTR_TYPE_TTY, &irq);
if (rc < 0)
goto fail;
476
477 printf("[XEN] IPI cpu=%d irq=%d vector=RESCHEDULE_VECTOR (%d)\n",
478 cpu, irq, RESCHEDULE_VECTOR);
479
480 per_cpu(resched_irq, cpu) = irq;
481
482 sprintf(callfunc_name[cpu], "callfunc%u", cpu);
483 rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
484 cpu,
485 callfunc_name[cpu],
486 smp_call_function_interrupt,
487 INTR_TYPE_TTY, &irq);
488 if (rc < 0)
489 goto fail;
490 per_cpu(callfunc_irq, cpu) = irq;
491
492 printf("[XEN] IPI cpu=%d irq=%d vector=CALL_FUNCTION_VECTOR (%d)\n",
493 cpu, irq, CALL_FUNCTION_VECTOR);
494
495
496 if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0))
497 goto fail;
498
499 return 0;
500
501 fail:
502 if (per_cpu(resched_irq, cpu) >= 0)
503 unbind_from_irqhandler(per_cpu(resched_irq, cpu));
504 if (per_cpu(callfunc_irq, cpu) >= 0)
505 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu));
506 return rc;
507 }
508
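/*
 * Bind the IPI event channels for every CPU; run from the start_ipis
 * SYSINIT at the bottom of this file.
 */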
509 static void
510 xen_smp_intr_init_cpus(void *unused)
511 {
512 int i;
513
514 for (i = 0; i < mp_ncpus; i++)
515 xen_smp_intr_init(i);
516 }
517
518 #define MTOPSIZE (1<<(14 + PAGE_SHIFT))
519
520 /*
521 * AP CPUs call this to initialize themselves.
522 */
523 void
524 init_secondary(void)
525 {
526 vm_offset_t addr;
527 u_int cpuid;
528 int gsel_tss;
529
530
531 /* bootAP is set in start_ap() to our ID. */
532 PCPU_SET(currentldt, _default_ldt);
533 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
534 #if 0
535 gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
536 #endif
537 PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
538 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
539 PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
540 #if 0
541 PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);
542
543 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
544 #endif
545 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
546
547 /*
548 * Set to a known state:
549 * Set by mpboot.s: CR0_PG, CR0_PE
550 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
551 */
552 /*
553 * signal our startup to the BSP.
554 */
555 mp_naps++;
556
557 /* Spin until the BSP releases the APs. */
558 while (!aps_ready)
559 ia32_pause();
560
561 /* BSP may have changed PTD while we were waiting */
562 invltlb();
563 for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
564 invlpg(addr);
565
566 /* set up FPU state on the AP */
567 npxinit();
568 #if 0
569
570 /* set up SSE registers */
571 enable_sse();
572 #endif
573 #if 0 && defined(PAE)
574 /* Enable the PTE no-execute bit. */
575 if ((amd_feature & AMDID_NX) != 0) {
576 uint64_t msr;
577
578 msr = rdmsr(MSR_EFER) | EFER_NXE;
579 wrmsr(MSR_EFER, msr);
580 }
581 #endif
582 #if 0
583 /* A quick check from sanity claus */
584 if (PCPU_GET(apic_id) != lapic_id()) {
585 printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
586 printf("SMP: actual apic_id = %d\n", lapic_id());
587 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
588 panic("cpuid mismatch! boom!!");
589 }
590 #endif
591
592 /* Initialize curthread. */
593 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
594 PCPU_SET(curthread, PCPU_GET(idlethread));
595
596 mtx_lock_spin(&ap_boot_mtx);
597 #if 0
598
599 /* Init local apic for irq's */
600 lapic_setup(1);
601 #endif
602 smp_cpus++;
603
604 cpuid = PCPU_GET(cpuid);
605 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
606 printf("SMP: AP CPU #%d Launched!\n", cpuid);
607
608 /* Determine if we are a logical CPU. */
609 if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
610 CPU_SET(cpuid, &logical_cpus_mask);
611
612 /* Determine if we are a hyperthread. */
613 if (hyperthreading_cpus > 1 &&
614 PCPU_GET(apic_id) % hyperthreading_cpus != 0)
615 CPU_SET(cpuid, &hyperthreading_cpus_mask);
616 #if 0
617 if (bootverbose)
618 lapic_dump("AP");
619 #endif
620 if (smp_cpus == mp_ncpus) {
621 /* enable IPIs, TLB shootdown, freezes, etc. */
622 atomic_store_rel_int(&smp_started, 1);
623 smp_active = 1; /* historic */
624 }
625
626 mtx_unlock_spin(&ap_boot_mtx);
627
628 /* wait until all the APs are up */
629 while (smp_started == 0)
630 ia32_pause();
631
632 PCPU_SET(curthread, PCPU_GET(idlethread));
633
634 /* Start per-CPU event timers. */
635 cpu_initclocks_ap();
636
637 /* enter the scheduler */
638 sched_throw(NULL);
639
640 panic("scheduler returned us to %s", __func__);
641 /* NOTREACHED */
642 }
643
644 /*******************************************************************
645 * local functions and data
646 */
647
648 /*
649 * We tell the I/O APIC code about all the CPUs we want to receive
650 * interrupts. If we don't want certain CPUs to receive IRQs we
651 * can simply not tell the I/O APIC code about them in this function.
652 * We also do not tell it about the BSP since it tells itself about
653 * the BSP internally to work with UP kernels and on UP machines.
654 */
655 static void
656 set_interrupt_apic_ids(void)
657 {
658 u_int i, apic_id;
659
660 for (i = 0; i < MAXCPU; i++) {
661 apic_id = cpu_apic_ids[i];
662 if (apic_id == -1)
663 continue;
664 if (cpu_info[apic_id].cpu_bsp)
665 continue;
666 if (cpu_info[apic_id].cpu_disabled)
667 continue;
668
669 /* Don't let hyperthreads service interrupts. */
670 if (hyperthreading_cpus > 1 &&
671 apic_id % hyperthreading_cpus != 0)
672 continue;
673
674 intr_add_cpu(i);
675 }
676 }
677
678 /*
679 * Assign logical CPU IDs to local APICs.
680 */
681 static void
682 assign_cpu_ids(void)
683 {
684 u_int i;
685
686 /* Check for explicitly disabled CPUs. */
687 for (i = 0; i <= MAX_APIC_ID; i++) {
688 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
689 continue;
690
691 /* Don't use this CPU if it has been disabled by a tunable. */
692 if (resource_disabled("lapic", i)) {
693 cpu_info[i].cpu_disabled = 1;
694 continue;
695 }
696 }
697
698 /*
699 * Assign CPU IDs to local APIC IDs and disable any CPUs
700 * beyond MAXCPU. CPU 0 has already been assigned to the BSP,
701 * so we only have to assign IDs for APs.
702 */
703 mp_ncpus = 1;
704 for (i = 0; i <= MAX_APIC_ID; i++) {
705 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
706 cpu_info[i].cpu_disabled)
707 continue;
708
709 if (mp_ncpus < MAXCPU) {
710 cpu_apic_ids[mp_ncpus] = i;
711 apic_cpuids[i] = mp_ncpus;
712 mp_ncpus++;
713 } else
714 cpu_info[i].cpu_disabled = 1;
715 }
716 KASSERT(mp_maxid >= mp_ncpus - 1,
717 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
718 mp_ncpus));
719 }
720
721 /*
722 * start each AP in our list
723 */
724 /* Lowest 1MB is already mapped: don't touch */
725 #define TMPMAP_START 1
726 int
727 start_all_aps(void)
728 {
729 int x, apic_id, cpu;
730 struct pcpu *pc;
731
732 mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
733
734 /* set up temporary P==V mapping for AP boot */
735 /* XXX this is a hack, we should boot the AP on its own stack/PTD */
736
737 /* start each AP */
738 for (cpu = 1; cpu < mp_ncpus; cpu++) {
739 apic_id = cpu_apic_ids[cpu];
740
741
742 bootAP = cpu;
743 bootAPgdt = gdt + (512*cpu);
744
745 /* Get per-cpu data */
746 pc = &__pcpu[bootAP];
747 pcpu_init(pc, bootAP, sizeof(struct pcpu));
748 dpcpu_init((void *)kmem_alloc(kernel_map, DPCPU_SIZE), bootAP);
749 pc->pc_apic_id = cpu_apic_ids[bootAP];
750 pc->pc_prvspace = pc;
751 pc->pc_curthread = 0;
752
753 gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
754 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
755
756 PT_SET_MA(bootAPgdt, VTOM(bootAPgdt) | PG_V | PG_RW);
757 bzero(bootAPgdt, PAGE_SIZE);
758 for (x = 0; x < NGDT; x++)
759 ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd);
760 PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V);
761 #ifdef notyet
762
763 if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) {
764 apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
765 acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id);
766 #ifdef CONFIG_ACPI
767 if (acpiid != 0xff)
768 x86_acpiid_to_apicid[acpiid] = apicid;
769 #endif
770 }
771 #endif
772
773 /* attempt to start the Application Processor */
774 if (!start_ap(cpu)) {
775 printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
776 /* better panic as the AP may be running loose */
777 printf("panic y/n? [y] ");
778 if (cngetc() != 'n')
779 panic("bye-bye");
780 }
781
782 CPU_SET(cpu, &all_cpus); /* record AP in CPU map */
783 }
784
785
786 pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);
787
788 /* number of APs actually started */
789 return mp_naps;
790 }
791
792 extern uint8_t *pcpu_boot_stack;
793 extern trap_info_t trap_table[];
794
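/*
 * Copy the kernel trap_table[] into the new vCPU's trap context so the AP
 * starts with the same exception handlers as the BSP.
 */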
795 static void
796 smp_trap_init(trap_info_t *trap_ctxt)
797 {
798 const trap_info_t *t = trap_table;
799
800 for (t = trap_table; t->address; t++) {
801 trap_ctxt[t->vector].flags = t->flags;
802 trap_ctxt[t->vector].cs = t->cs;
803 trap_ctxt[t->vector].address = t->address;
804 }
805 }
806
807 extern int nkpt;
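
/*
 * Construct the initial vCPU state for an AP: allocate and wire a private
 * PTD, PDPT and boot stack, seed the PTD with the kernel mappings from
 * IdlePTD, fill in a vcpu_guest_context_t (GDT, trap table, callbacks,
 * initial stack and the entry point init_secondary()), then ask the
 * hypervisor to initialise and bring the vCPU up.
 */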
808 static void
809 cpu_initialize_context(unsigned int cpu)
810 {
811 /* vcpu_guest_context_t is too large to allocate on the stack, so it
812 * is static; callers are serialized by starting the APs one at a time. */
813 vm_page_t m[NPGPTD + 2];
814 static vcpu_guest_context_t ctxt;
815 vm_offset_t boot_stack;
816 vm_offset_t newPTD;
817 vm_paddr_t ma[NPGPTD];
818 static int color;
819 int i;
820
821 /*
822 * Pages [0-3]: PTD
823 * Page [4]: boot stack
824 * Page [5]: PDPT
826 */
827 for (i = 0; i < NPGPTD + 2; i++) {
828 m[i] = vm_page_alloc(NULL, color++,
829 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
830 VM_ALLOC_ZERO);
831
832 pmap_zero_page(m[i]);
833
834 }
835 boot_stack = kmem_alloc_nofault(kernel_map, 1);
836 newPTD = kmem_alloc_nofault(kernel_map, NPGPTD);
837 ma[0] = VM_PAGE_TO_MACH(m[0])|PG_V;
838
839 #ifdef PAE
840 pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
841 for (i = 0; i < NPGPTD; i++) {
842 ((vm_paddr_t *)boot_stack)[i] =
843 ma[i] = VM_PAGE_TO_MACH(m[i])|PG_V;
844 }
845 #endif
846
847 /*
848 * Copy cpu0 IdlePTD to new IdlePTD - copying only
849 * kernel mappings
850 */
851 pmap_qenter(newPTD, m, 4);
852
853 memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t),
854 (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t),
855 nkpt*sizeof(vm_paddr_t));
856
857 pmap_qremove(newPTD, 4);
858 kmem_free(kernel_map, newPTD, 4);
859 /*
860 * map actual idle stack to boot_stack
861 */
862 pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD]));
863
864
865 xen_pgdpt_pin(VM_PAGE_TO_MACH(m[NPGPTD + 1]));
866 vm_page_lock_queues();
867 for (i = 0; i < 4; i++) {
868 int pdir = (PTDPTDI + i) / NPDEPG;
869 int curoffset = (PTDPTDI + i) % NPDEPG;
870
871 xen_queue_pt_update((vm_paddr_t)
872 ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))),
873 ma[i]);
874 }
875 PT_UPDATES_FLUSH();
876 vm_page_unlock_queues();
877
878 memset(&ctxt, 0, sizeof(ctxt));
879 ctxt.flags = VGCF_IN_KERNEL;
880 ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
881 ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
882 ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL);
883 ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL);
884 ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
885 ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
886 ctxt.user_regs.eip = (unsigned long)init_secondary;
887 ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */
888
889 memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
890
891 smp_trap_init(ctxt.trap_ctxt);
892
893 ctxt.ldt_ents = 0;
894 ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT);
895 ctxt.gdt_ents = 512;
896
897 #ifdef __i386__
898 ctxt.user_regs.esp = boot_stack + PAGE_SIZE;
899
900 ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
901 ctxt.kernel_sp = boot_stack + PAGE_SIZE;
902
903 ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
904 ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback;
905 ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
906 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
907
908 ctxt.ctrlreg[3] = VM_PAGE_TO_MACH(m[NPGPTD + 1]);
909 #else /* __x86_64__ */
910 ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
911 ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
912 ctxt.kernel_sp = idle->thread.rsp0;
913
914 ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
915 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
916 ctxt.syscall_callback_eip = (unsigned long)system_call;
917
918 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
919
920 ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
921 #endif
922
923 printf("gdtpfn=%lx pdptpfn=%lx\n",
924 ctxt.gdt_frames[0],
925 ctxt.ctrlreg[3] >> PAGE_SHIFT);
926
927 PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
928 DELAY(3000);
929 PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
930 }
931
932 /*
933 * This function starts the AP (application processor) identified
934 * by the 'apic_id' argument. It does quite a "song and dance"
935 * to accomplish this. This is necessary because of the nuances
936 * of the different hardware we might encounter. It isn't pretty,
937 * but it seems to work.
938 */
939
940 int cpus;
941 static int
942 start_ap(int apic_id)
943 {
944 int ms;
945
946 /* used as a watchpoint to signal AP startup */
947 cpus = mp_naps;
948
949 cpu_initialize_context(apic_id);
950
951 /* Wait up to 5 seconds for it to start. */
952 for (ms = 0; ms < 5000; ms++) {
953 if (mp_naps > cpus)
954 return 1; /* return SUCCESS */
955 DELAY(1000);
956 }
957 return 0; /* return FAILURE */
958 }
959
960 /*
961 * Send an IPI to a specific CPU. Bitmapped IPIs are latched in
 * cpu_ipi_pending[]; all others use the shared call_data descriptor.
962 */
963 static void
964 ipi_send_cpu(int cpu, u_int ipi)
965 {
966 u_int bitmap, old_pending, new_pending;
967
968 if (IPI_IS_BITMAPED(ipi)) {
969 bitmap = 1 << ipi;
970 ipi = IPI_BITMAP_VECTOR;
971 do {
972 old_pending = cpu_ipi_pending[cpu];
973 new_pending = old_pending | bitmap;
974 } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
975 old_pending, new_pending));
976 if (!old_pending)
977 ipi_pcpu(cpu, RESCHEDULE_VECTOR);
978 } else {
979 KASSERT(call_data != NULL, ("call_data not set"));
980 ipi_pcpu(cpu, CALL_FUNCTION_VECTOR);
981 }
982 }
983
984 /*
985 * Flush the TLB on all other CPUs: the handler selected by 'vector' runs
 * on every other CPU, and each one acknowledges by bumping smp_tlb_wait.
986 */
987 static void
988 smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
989 {
990 u_int ncpu;
991 struct _call_data data;
992
993 ncpu = mp_ncpus - 1; /* does not shoot down self */
994 if (ncpu < 1)
995 return; /* no other cpus */
996 if (!(read_eflags() & PSL_I))
997 panic("%s: interrupts disabled", __func__);
998 mtx_lock_spin(&smp_ipi_mtx);
999 KASSERT(call_data == NULL, ("call_data isn't null?!"));
1000 call_data = &data;
1001 call_data->func_id = vector;
1002 call_data->arg1 = addr1;
1003 call_data->arg2 = addr2;
1004 atomic_store_rel_int(&smp_tlb_wait, 0);
1005 ipi_all_but_self(vector);
1006 while (smp_tlb_wait < ncpu)
1007 ia32_pause();
1008 call_data = NULL;
1009 mtx_unlock_spin(&smp_ipi_mtx);
1010 }
1011
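/*
 * As smp_tlb_shootdown(), but only the CPUs in 'mask' (excluding ourselves)
 * are targeted.
 */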
1012 static void
1013 smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
1014 {
1015 int cpu, ncpu, othercpus;
1016 struct _call_data data;
1017
1018 othercpus = mp_ncpus - 1;
1019 if (CPU_ISFULLSET(&mask)) {
1020 if (othercpus < 1)
1021 return;
1022 } else {
1023 CPU_CLR(PCPU_GET(cpuid), &mask);
1024 if (CPU_EMPTY(&mask))
1025 return;
1026 }
1027 if (!(read_eflags() & PSL_I))
1028 panic("%s: interrupts disabled", __func__);
1029 mtx_lock_spin(&smp_ipi_mtx);
1030 KASSERT(call_data == NULL, ("call_data isn't null?!"));
1031 call_data = &data;
1032 call_data->func_id = vector;
1033 call_data->arg1 = addr1;
1034 call_data->arg2 = addr2;
1035 atomic_store_rel_int(&smp_tlb_wait, 0);
1036 if (CPU_ISFULLSET(&mask)) {
1037 ncpu = othercpus;
1038 ipi_all_but_self(vector);
1039 } else {
1040 ncpu = 0;
1041 while ((cpu = CPU_FFS(&mask)) != 0) {
1042 cpu--;
1043 CPU_CLR(cpu, &mask);
1044 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu,
1045 vector);
1046 ipi_send_cpu(cpu, vector);
1047 ncpu++;
1048 }
1049 }
1050 while (smp_tlb_wait < ncpu)
1051 ia32_pause();
1052 call_data = NULL;
1053 mtx_unlock_spin(&smp_ipi_mtx);
1054 }
1055
1056 void
1057 smp_cache_flush(void)
1058 {
1059
1060 if (smp_started)
1061 smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
1062 }
1063
1064 void
1065 smp_invltlb(void)
1066 {
1067
1068 if (smp_started) {
1069 smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
1070 }
1071 }
1072
1073 void
1074 smp_invlpg(vm_offset_t addr)
1075 {
1076
1077 if (smp_started) {
1078 smp_tlb_shootdown(IPI_INVLPG, addr, 0);
1079 }
1080 }
1081
1082 void
1083 smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
1084 {
1085
1086 if (smp_started) {
1087 smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
1088 }
1089 }
1090
1091 void
1092 smp_masked_invltlb(cpuset_t mask)
1093 {
1094
1095 if (smp_started) {
1096 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
1097 }
1098 }
1099
1100 void
1101 smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
1102 {
1103
1104 if (smp_started) {
1105 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
1106 }
1107 }
1108
1109 void
1110 smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
1111 {
1112
1113 if (smp_started) {
1114 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
1115 }
1116 }
1117
1118 /*
1119 * Send an IPI to a set of CPUs.
1120 */
1121 void
1122 ipi_selected(cpuset_t cpus, u_int ipi)
1123 {
1124 int cpu;
1125
1126 /*
1127 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1128 * of help in order to understand what the source is.
1129 * Set the mask of receiving CPUs for this purpose.
1130 */
1131 if (ipi == IPI_STOP_HARD)
1132 CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus);
1133
1134 while ((cpu = CPU_FFS(&cpus)) != 0) {
1135 cpu--;
1136 CPU_CLR(cpu, &cpus);
1137 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1138 ipi_send_cpu(cpu, ipi);
1139 }
1140 }
1141
1142 /*
1143 * Send an IPI to a specific CPU.
1144 */
1145 void
1146 ipi_cpu(int cpu, u_int ipi)
1147 {
1148
1149 /*
1150 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1151 * of help in order to understand what the source is.
1152 * Set the mask of receiving CPUs for this purpose.
1153 */
1154 if (ipi == IPI_STOP_HARD)
1155 CPU_SET_ATOMIC(cpu, &ipi_nmi_pending);
1156
1157 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1158 ipi_send_cpu(cpu, ipi);
1159 }
1160
1161 /*
1162 * Send an IPI to all CPUs EXCEPT myself.
1163 */
1164 void
1165 ipi_all_but_self(u_int ipi)
1166 {
1167 cpuset_t other_cpus;
1168
1169 /*
1170 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1171 * of help in order to understand what the source is.
1172 * Set the mask of receiving CPUs for this purpose.
1173 */
1174 other_cpus = all_cpus;
1175 CPU_CLR(PCPU_GET(cpuid), &other_cpus);
1176 if (ipi == IPI_STOP_HARD)
1177 CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus);
1178
1179 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1180 ipi_selected(other_cpus, ipi);
1181 }
1182
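/*
 * NMI filter: return 0 (handled) only when this CPU has a pending
 * IPI_STOP_HARD recorded in ipi_nmi_pending, in which case we park in
 * cpustop_handler() until restarted.
 */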
1183 int
1184 ipi_nmi_handler(void)
1185 {
1186 u_int cpuid;
1187
1188 /*
1189 * As long as there is not a simple way to know about an NMI's
1190 * source, if the bitmask for the current CPU is present in
1191 * the global pending bitword an IPI_STOP_HARD has been issued
1192 * and should be handled.
1193 */
1194 cpuid = PCPU_GET(cpuid);
1195 if (!CPU_ISSET(cpuid, &ipi_nmi_pending))
1196 return (1);
1197
1198 CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending);
1199 cpustop_handler();
1200 return (0);
1201 }
1202
1203 /*
1204 * Handle an IPI_STOP by saving our current context and spinning until we
1205 * are resumed.
1206 */
1207 void
1208 cpustop_handler(void)
1209 {
1210 int cpu;
1211
1212 cpu = PCPU_GET(cpuid);
1213
1214 savectx(&stoppcbs[cpu]);
1215
1216 /* Indicate that we are stopped */
1217 CPU_SET_ATOMIC(cpu, &stopped_cpus);
1218
1219 /* Wait for restart */
1220 while (!CPU_ISSET(cpu, &started_cpus))
1221 ia32_pause();
1222
1223 CPU_CLR_ATOMIC(cpu, &started_cpus);
1224 CPU_CLR_ATOMIC(cpu, &stopped_cpus);
1225
1226 if (cpu == 0 && cpustop_restartfunc != NULL) {
1227 cpustop_restartfunc();
1228 cpustop_restartfunc = NULL;
1229 }
1230 }
1231
1232 /*
1233 * This is called once the rest of the system is up and running and we're
1234 * ready to let the APs out of the pen.
1235 */
1236 static void
1237 release_aps(void *dummy __unused)
1238 {
1239
1240 if (mp_ncpus == 1)
1241 return;
1242 atomic_store_rel_int(&aps_ready, 1);
1243 while (smp_started == 0)
1244 ia32_pause();
1245 }
1246 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1247 SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL);
1248