1 /*-
2 * Copyright (c) 1996, by Steve Passe
3 * Copyright (c) 2008, by Kip Macy
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. The name of the developer may NOT be used to endorse or promote products
12 * derived from this software without specific prior written permission.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD: releng/8.3/sys/i386/xen/mp_machdep.c 222700 2011-06-04 22:51:06Z attilio $");
29
30 #include "opt_apic.h"
31 #include "opt_cpu.h"
32 #include "opt_kstack_pages.h"
33 #include "opt_mp_watchdog.h"
34 #include "opt_pmap.h"
35 #include "opt_sched.h"
36 #include "opt_smp.h"
37
38 #if !defined(lint)
39 #if !defined(SMP)
40 #error How did you get here?
41 #endif
42
43 #ifndef DEV_APIC
44 #error The apic device is required for SMP, add "device apic" to your config file.
45 #endif
46 #if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
47 #error SMP not supported with CPU_DISABLE_CMPXCHG
48 #endif
49 #endif /* not lint */
50
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/bus.h>
54 #include <sys/cons.h> /* cngetc() */
55 #ifdef GPROF
56 #include <sys/gmon.h>
57 #endif
58 #include <sys/kernel.h>
59 #include <sys/ktr.h>
60 #include <sys/lock.h>
61 #include <sys/malloc.h>
62 #include <sys/memrange.h>
63 #include <sys/mutex.h>
64 #include <sys/pcpu.h>
65 #include <sys/proc.h>
66 #include <sys/sched.h>
67 #include <sys/smp.h>
68 #include <sys/sysctl.h>
69
70 #include <vm/vm.h>
71 #include <vm/vm_param.h>
72 #include <vm/pmap.h>
73 #include <vm/vm_kern.h>
74 #include <vm/vm_extern.h>
75 #include <vm/vm_page.h>
76
77 #include <machine/apicreg.h>
78 #include <machine/md_var.h>
79 #include <machine/mp_watchdog.h>
80 #include <machine/pcb.h>
81 #include <machine/psl.h>
82 #include <machine/smp.h>
83 #include <machine/specialreg.h>
84 #include <machine/pcpu.h>
85
86
87
88 #include <machine/xen/xen-os.h>
89 #include <xen/evtchn.h>
90 #include <xen/xen_intr.h>
91 #include <xen/hypervisor.h>
92 #include <xen/interface/vcpu.h>
93
94
int mp_naps;                    /* # of Applications processors */
int boot_cpu_id = -1;           /* designated BSP */

extern struct pcpu __pcpu[];

/* Logical id and GDT of the AP currently being started; see start_all_aps(). */
static int bootAP;
static union descriptor *bootAPgdt;

/* Names for the per-CPU IPI event channels bound in xen_smp_intr_init(). */
static char resched_name[NR_CPUS][15];
static char callfunc_name[NR_CPUS][15];

/* Free these after use */
void *bootstacks[MAXCPU];

/* Hotwire a 0->4MB V==P mapping */
extern pt_entry_t *KPTphys;

/* Per-CPU contexts saved by cpustop_handler() on IPI_STOP. */
struct pcb stoppcbs[MAXCPU];

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
volatile int smp_tlb_wait;      /* # of CPUs that acked the current function IPI */

/* Signature of the per-vector function-IPI handlers in ipi_vectors[]. */
typedef void call_data_func_t(uintptr_t , uintptr_t);

static u_int logical_cpus;
/* CPUs that may have been sent an IPI_STOP_HARD NMI; see ipi_nmi_handler(). */
static volatile cpumask_t ipi_nmi_pending;

/* used to hold the AP's until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
	int	cpu_disabled:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];		/* logical CPU id -> APIC id (-1 = unused) */
int apic_cpuids[MAX_APIC_ID + 1];	/* APIC id -> logical CPU id */

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static int cpu_logical;		/* threads per core; 0 until detected */
static int cpu_cores;		/* cores per package; 0 until detected */

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static u_int	hyperthreading_cpus;
static cpumask_t	hyperthreading_cpus_mask;

extern void Xhypervisor_callback(void);
extern void failsafe_callback(void);
extern void pmap_lazyfix_action(void);
160
161 struct cpu_group *
162 cpu_topo(void)
163 {
164 if (cpu_cores == 0)
165 cpu_cores = 1;
166 if (cpu_logical == 0)
167 cpu_logical = 1;
168 if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
169 printf("WARNING: Non-uniform processors.\n");
170 printf("WARNING: Using suboptimal topology.\n");
171 return (smp_topo_none());
172 }
173 /*
174 * No multi-core or hyper-threaded.
175 */
176 if (cpu_logical * cpu_cores == 1)
177 return (smp_topo_none());
178 /*
179 * Only HTT no multi-core.
180 */
181 if (cpu_logical > 1 && cpu_cores == 1)
182 return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
183 /*
184 * Only multi-core no HTT.
185 */
186 if (cpu_cores > 1 && cpu_logical == 1)
187 return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
188 /*
189 * Both HTT and multi-core.
190 */
191 return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
192 CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
193 }
194
/*
 * Calculate usable address in base memory for AP trampoline code.
 * Under Xen the APs are started through hypercalls rather than a
 * real-mode trampoline, so no base memory needs to be reserved and
 * basemem is returned unchanged.
 */
u_int
mp_bootaddress(u_int basemem)
{

	return (basemem);
}
204
205 void
206 cpu_add(u_int apic_id, char boot_cpu)
207 {
208
209 if (apic_id > MAX_APIC_ID) {
210 panic("SMP: APIC ID %d too high", apic_id);
211 return;
212 }
213 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
214 apic_id));
215 cpu_info[apic_id].cpu_present = 1;
216 if (boot_cpu) {
217 KASSERT(boot_cpu_id == -1,
218 ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
219 boot_cpu_id));
220 boot_cpu_id = apic_id;
221 cpu_info[apic_id].cpu_bsp = 1;
222 }
223 if (mp_ncpus < MAXCPU)
224 mp_ncpus++;
225 if (bootverbose)
226 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
227 "AP");
228 }
229
/*
 * Set mp_maxid, the highest CPU ID the system may use.  No early
 * enumeration is available here, so allow the full compile-time range.
 */
void
cpu_mp_setmaxid(void)
{

	mp_maxid = MAXCPU - 1;
}
236
237 int
238 cpu_mp_probe(void)
239 {
240
241 /*
242 * Always record BSP in CPU map so that the mbuf init code works
243 * correctly.
244 */
245 all_cpus = 1;
246 if (mp_ncpus == 0) {
247 /*
248 * No CPUs were found, so this must be a UP system. Setup
249 * the variables to represent a system with a single CPU
250 * with an id of 0.
251 */
252 mp_ncpus = 1;
253 return (0);
254 }
255
256 /* At least one CPU was found. */
257 if (mp_ncpus == 1) {
258 /*
259 * One CPU was found, so this must be a UP system with
260 * an I/O APIC.
261 */
262 return (0);
263 }
264
265 /* At least two CPUs were found. */
266 return (1);
267 }
268
/*
 * Initialize the IPI handlers and start up the AP's.
 */
void
cpu_mp_start(void)
{
	int i;

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;	/* -1 marks an unused slot */
		cpu_ipi_pending[i] = 0;
	}

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));
	/* Logical CPU 0 is always the BSP. */
	cpu_apic_ids[0] = boot_cpu_id;
	apic_cpuids[boot_cpu_id] = 0;

	/* Map the remaining logical CPU IDs onto present APIC IDs. */
	assign_cpu_ids();

	/* Start each Application Processor */
	start_all_aps();

	/* Setup the initial logical CPUs info. */
	logical_cpus = logical_cpus_mask = 0;
	if (cpu_feature & CPUID_HTT)
		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;

	set_interrupt_apic_ids();
}
305
306
/* Function-IPI handler: participate in an SMP rendezvous on this CPU. */
static void
iv_rendezvous(uintptr_t a, uintptr_t b)
{
	smp_rendezvous_action();
}
312
/* Function-IPI handler: flush this CPU's entire TLB via the hypervisor. */
static void
iv_invltlb(uintptr_t a, uintptr_t b)
{
	xen_tlb_flush();
}
318
/* Function-IPI handler: invalidate the single page whose address is 'a'. */
static void
iv_invlpg(uintptr_t a, uintptr_t b)
{
	xen_invlpg(a);
}
324
325 static void
326 iv_invlrng(uintptr_t a, uintptr_t b)
327 {
328 vm_offset_t start = (vm_offset_t)a;
329 vm_offset_t end = (vm_offset_t)b;
330
331 while (start < end) {
332 xen_invlpg(start);
333 start += PAGE_SIZE;
334 }
335 }
336
337
/* Function-IPI handler: write back and invalidate this CPU's caches. */
static void
iv_invlcache(uintptr_t a, uintptr_t b)
{

	wbinvd();
	/*
	 * NOTE(review): smp_call_function_interrupt() also bumps
	 * smp_tlb_wait after this handler returns, so this vector
	 * appears to be counted twice per CPU -- confirm against the
	 * shootdown wait loop before relying on the count.
	 */
	atomic_add_int(&smp_tlb_wait, 1);
}
345
/* Function-IPI handler: complete a lazy pmap deactivation on this CPU. */
static void
iv_lazypmap(uintptr_t a, uintptr_t b)
{
	pmap_lazyfix_action();
	/*
	 * NOTE(review): the caller (smp_call_function_interrupt) also
	 * increments smp_tlb_wait once this returns -- confirm whether
	 * the double increment is intentional.
	 */
	atomic_add_int(&smp_tlb_wait, 1);
}
352
/*
 * These start from "IPI offset" APIC_IPI_INTS.
 * Dispatch table for function IPIs, indexed by (func_id - APIC_IPI_INTS)
 * in smp_call_function_interrupt(); the order must match the IPI_*
 * vector numbering.
 */
static call_data_func_t *ipi_vectors[6] =
{
	iv_rendezvous,
	iv_invltlb,
	iv_invlpg,
	iv_invlrng,
	iv_invlcache,
	iv_lazypmap,
};
365
366 /*
367 * Reschedule call back. Nothing to do,
368 * all the work is done automatically when
369 * we return from the interrupt.
370 */
371 static int
372 smp_reschedule_interrupt(void *unused)
373 {
374 int cpu = PCPU_GET(cpuid);
375 u_int ipi_bitmap;
376
377 ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
378
379 if (ipi_bitmap & (1 << IPI_PREEMPT)) {
380 #ifdef COUNT_IPIS
381 (*ipi_preempt_counts[cpu])++;
382 #endif
383 sched_preempt(curthread);
384 }
385
386 if (ipi_bitmap & (1 << IPI_AST)) {
387 #ifdef COUNT_IPIS
388 (*ipi_ast_counts[cpu])++;
389 #endif
390 /* Nothing to do for AST */
391 }
392 return (FILTER_HANDLED);
393 }
394
/*
 * Argument block for function IPIs.  The initiating CPU fills one in
 * (on its stack, serialized by smp_ipi_mtx), points 'call_data' at it
 * and raises CALL_FUNCTION_VECTOR; targets read it in
 * smp_call_function_interrupt().
 */
struct _call_data {
	uint16_t func_id;	/* IPI vector selecting the handler */
	uint16_t wait;		/* non-zero: initiator waits for 'finished' */
	uintptr_t arg1;
	uintptr_t arg2;
	atomic_t started;	/* # of targets that copied the arguments */
	atomic_t finished;	/* # of targets done (only used if wait) */
};

/* Currently active call; NULL when no function IPI is in flight. */
static struct _call_data *call_data;
405
/*
 * Interrupt filter for CALL_FUNCTION_VECTOR.  Looks up the requested
 * handler in ipi_vectors[] from the parameters the initiator published
 * in *call_data, runs it, and acknowledges through the started/finished
 * counters and smp_tlb_wait.
 */
static int
smp_call_function_interrupt(void *unused)
{
	call_data_func_t *func;
	/* Snapshot the arguments before acknowledging 'started'. */
	uintptr_t arg1 = call_data->arg1;
	uintptr_t arg2 = call_data->arg2;
	int wait = call_data->wait;
	atomic_t *started = &call_data->started;
	atomic_t *finished = &call_data->finished;

	/* We only handle function IPIs, not bitmap IPIs */
	if (call_data->func_id < APIC_IPI_INTS || call_data->func_id > IPI_BITMAP_VECTOR)
		panic("invalid function id %u", call_data->func_id);

	func = ipi_vectors[call_data->func_id - APIC_IPI_INTS];
	/*
	 * Notify initiating CPU that I've grabbed the data and am
	 * about to execute the function
	 */
	mb();
	atomic_inc(started);
	/*
	 * At this point the info structure may be out of scope unless wait==1
	 */
	(*func)(arg1, arg2);

	if (wait) {
		mb();
		atomic_inc(finished);
	}
	/* Tell the shootdown initiator this CPU is done. */
	atomic_add_int(&smp_tlb_wait, 1);
	return (FILTER_HANDLED);
}
439
440 /*
441 * Print various information about the SMP system hardware and setup.
442 */
443 void
444 cpu_mp_announce(void)
445 {
446 int i, x;
447
448 /* List CPUs */
449 printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
450 for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
451 if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
452 continue;
453 if (cpu_info[x].cpu_disabled)
454 printf(" cpu (AP): APIC ID: %2d (disabled)\n", x);
455 else {
456 KASSERT(i < mp_ncpus,
457 ("mp_ncpus and actual cpus are out of whack"));
458 printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
459 }
460 }
461 }
462
463 static int
464 xen_smp_intr_init(unsigned int cpu)
465 {
466 int rc;
467 unsigned int irq;
468
469 per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
470
471 sprintf(resched_name[cpu], "resched%u", cpu);
472 rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
473 cpu,
474 resched_name[cpu],
475 smp_reschedule_interrupt,
476 INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq);
477
478 printf("[XEN] IPI cpu=%d irq=%d vector=RESCHEDULE_VECTOR (%d)\n",
479 cpu, irq, RESCHEDULE_VECTOR);
480
481 per_cpu(resched_irq, cpu) = irq;
482
483 sprintf(callfunc_name[cpu], "callfunc%u", cpu);
484 rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
485 cpu,
486 callfunc_name[cpu],
487 smp_call_function_interrupt,
488 INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq);
489 if (rc < 0)
490 goto fail;
491 per_cpu(callfunc_irq, cpu) = irq;
492
493 printf("[XEN] IPI cpu=%d irq=%d vector=CALL_FUNCTION_VECTOR (%d)\n",
494 cpu, irq, CALL_FUNCTION_VECTOR);
495
496
497 if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0))
498 goto fail;
499
500 return 0;
501
502 fail:
503 if (per_cpu(resched_irq, cpu) >= 0)
504 unbind_from_irqhandler(per_cpu(resched_irq, cpu));
505 if (per_cpu(callfunc_irq, cpu) >= 0)
506 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu));
507 return rc;
508 }
509
510 static void
511 xen_smp_intr_init_cpus(void *unused)
512 {
513 int i;
514
515 for (i = 0; i < mp_ncpus; i++)
516 xen_smp_intr_init(i);
517 }
518
519 #define MTOPSIZE (1<<(14 + PAGE_SHIFT))
520
/*
 * AP CPU's call this to initialize themselves.
 *
 * Execution arrives here through the vcpu context built by
 * cpu_initialize_context() (user_regs.eip points at this function) on
 * the dedicated boot stack.  The AP sets up its per-CPU TSS/GDT state,
 * announces itself to the BSP, parks until released, and finally enters
 * the scheduler.  Never returns.
 */
void
init_secondary(void)
{
	vm_offset_t addr;
	int	gsel_tss;	/* NOTE(review): computed but apparently unused below */


	/* bootAP is set in start_ap() to our ID. */
	PCPU_SET(currentldt, _default_ldt);
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
#if 0
	gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
#endif
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
#if 0
	PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);

	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
#endif
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	/*
	 * signal our startup to the BSP.
	 * start_ap() polls mp_naps to detect that we launched.
	 */
	mp_naps++;

	/* Spin until the BSP releases the AP's. */
	while (!aps_ready)
		ia32_pause();

	/* BSP may have changed PTD while we were waiting */
	invltlb();
	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
		invlpg(addr);

	/* set up FPU state on the AP */
	npxinit();
#if 0

	/* set up SSE registers */
	enable_sse();
#endif
#if 0 && defined(PAE)
	/* Enable the PTE no-execute bit. */
	if ((amd_feature & AMDID_NX) != 0) {
		uint64_t msr;

		msr = rdmsr(MSR_EFER) | EFER_NXE;
		wrmsr(MSR_EFER, msr);
	}
#endif
#if 0
	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}
#endif

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mtx_lock_spin(&ap_boot_mtx);
#if 0

	/* Init local apic for irq's */
	lapic_setup(1);
#endif
	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* Determine if we are a logical CPU. */
	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
		logical_cpus_mask |= PCPU_GET(cpumask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		hyperthreading_cpus_mask |= PCPU_GET(cpumask);

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
#if 0
	if (bootverbose)
		lapic_dump("AP");
#endif
	/* The last AP to arrive flips the global "SMP is up" switches. */
	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	 /* historic */
	}

	mtx_unlock_spin(&ap_boot_mtx);

	/* wait until all the AP's are up */
	while (smp_started == 0)
		ia32_pause();


	PCPU_SET(curthread, PCPU_GET(idlethread));
	/* enter the scheduler */
	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}
642
643 /*******************************************************************
644 * local functions and data
645 */
646
647 /*
648 * We tell the I/O APIC code about all the CPUs we want to receive
649 * interrupts. If we don't want certain CPUs to receive IRQs we
650 * can simply not tell the I/O APIC code about them in this function.
651 * We also do not tell it about the BSP since it tells itself about
652 * the BSP internally to work with UP kernels and on UP machines.
653 */
654 static void
655 set_interrupt_apic_ids(void)
656 {
657 u_int i, apic_id;
658
659 for (i = 0; i < MAXCPU; i++) {
660 apic_id = cpu_apic_ids[i];
661 if (apic_id == -1)
662 continue;
663 if (cpu_info[apic_id].cpu_bsp)
664 continue;
665 if (cpu_info[apic_id].cpu_disabled)
666 continue;
667
668 /* Don't let hyperthreads service interrupts. */
669 if (hyperthreading_cpus > 1 &&
670 apic_id % hyperthreading_cpus != 0)
671 continue;
672
673 intr_add_cpu(i);
674 }
675 }
676
/*
 * Assign logical CPU IDs to local APICs.  Fills cpu_apic_ids[] and
 * apic_cpuids[] and recomputes mp_ncpus; CPUs disabled via tunable or
 * beyond MAXCPU are marked cpu_disabled.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
	 * so we only have to assign IDs for APs.
	 */
	mp_ncpus = 1;
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			apic_cpuids[i] = mp_ncpus;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}
719
/*
 * start each AP in our list
 */
/* Lowest 1MB is already mapped: don't touch*/
#define TMPMAP_START 1
int
start_all_aps(void)
{
	int x,apic_id, cpu;
	struct pcpu *pc;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* set up temporary P==V mapping for AP boot */
	/* XXX this is a hack, we should boot the AP on its own stack/PTD */

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];


		/* Publish which AP is being booted; init_secondary() reads bootAP. */
		bootAP = cpu;
		/* Each vcpu gets its own page-sized 512-entry GDT. */
		bootAPgdt = gdt + (512*cpu);

		/* Get per-cpu data */
		pc = &__pcpu[bootAP];
		pcpu_init(pc, bootAP, sizeof(struct pcpu));
		dpcpu_init((void *)kmem_alloc(kernel_map, DPCPU_SIZE), bootAP);
		pc->pc_apic_id = cpu_apic_ids[bootAP];
		pc->pc_prvspace = pc;
		pc->pc_curthread = 0;

		/* Point the private-data and TSS descriptors at this CPU. */
		gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
		gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

		/* Temporarily map the GDT page writable while filling it... */
		PT_SET_MA(bootAPgdt, xpmap_ptom(VTOP(bootAPgdt)) | PG_V | PG_RW);
		bzero(bootAPgdt, PAGE_SIZE);
		for (x = 0; x < NGDT; x++)
			ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd);
		/* ...then drop PG_RW: Xen requires GDT pages be read-only. */
		PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V);
#ifdef notyet

		if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) {
			apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
			acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id);
#ifdef CONFIG_ACPI
			if (acpiid != 0xff)
				x86_acpiid_to_apicid[acpiid] = apicid;
#endif
		}
#endif

		/* attempt to start the Application Processor */
		if (!start_ap(cpu)) {
			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
			/* better panic as the AP may be running loose */
			printf("panic y/n? [y] ");
			if (cngetc() != 'n')
				panic("bye-bye");
		}

		all_cpus |= (1 << cpu);		/* record AP in CPU map */
	}


	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);

	/* number of APs actually started */
	return mp_naps;
}
793
794 extern uint8_t *pcpu_boot_stack;
795 extern trap_info_t trap_table[];
796
797 static void
798 smp_trap_init(trap_info_t *trap_ctxt)
799 {
800 const trap_info_t *t = trap_table;
801
802 for (t = trap_table; t->address; t++) {
803 trap_ctxt[t->vector].flags = t->flags;
804 trap_ctxt[t->vector].cs = t->cs;
805 trap_ctxt[t->vector].address = t->address;
806 }
807 }
808
809 extern int nkpt;
810 static void
811 cpu_initialize_context(unsigned int cpu)
812 {
813 /* vcpu_guest_context_t is too large to allocate on the stack.
814 * Hence we allocate statically and protect it with a lock */
815 vm_page_t m[4];
816 static vcpu_guest_context_t ctxt;
817 vm_offset_t boot_stack;
818 vm_offset_t newPTD;
819 vm_paddr_t ma[NPGPTD];
820 static int color;
821 int i;
822
823 /*
824 * Page 0,[0-3] PTD
825 * Page 1, [4] boot stack
826 * Page [5] PDPT
827 *
828 */
829 for (i = 0; i < NPGPTD + 2; i++) {
830 m[i] = vm_page_alloc(NULL, color++,
831 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
832 VM_ALLOC_ZERO);
833
834 pmap_zero_page(m[i]);
835
836 }
837 boot_stack = kmem_alloc_nofault(kernel_map, 1);
838 newPTD = kmem_alloc_nofault(kernel_map, NPGPTD);
839 ma[0] = xpmap_ptom(VM_PAGE_TO_PHYS(m[0]))|PG_V;
840
841 #ifdef PAE
842 pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
843 for (i = 0; i < NPGPTD; i++) {
844 ((vm_paddr_t *)boot_stack)[i] =
845 ma[i] =
846 xpmap_ptom(VM_PAGE_TO_PHYS(m[i]))|PG_V;
847 }
848 #endif
849
850 /*
851 * Copy cpu0 IdlePTD to new IdlePTD - copying only
852 * kernel mappings
853 */
854 pmap_qenter(newPTD, m, 4);
855
856 memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t),
857 (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t),
858 nkpt*sizeof(vm_paddr_t));
859
860 pmap_qremove(newPTD, 4);
861 kmem_free(kernel_map, newPTD, 4);
862 /*
863 * map actual idle stack to boot_stack
864 */
865 pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD]));
866
867
868 xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1])));
869 vm_page_lock_queues();
870 for (i = 0; i < 4; i++) {
871 int pdir = (PTDPTDI + i) / NPDEPG;
872 int curoffset = (PTDPTDI + i) % NPDEPG;
873
874 xen_queue_pt_update((vm_paddr_t)
875 ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))),
876 ma[i]);
877 }
878 PT_UPDATES_FLUSH();
879 vm_page_unlock_queues();
880
881 memset(&ctxt, 0, sizeof(ctxt));
882 ctxt.flags = VGCF_IN_KERNEL;
883 ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
884 ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
885 ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL);
886 ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL);
887 ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
888 ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
889 ctxt.user_regs.eip = (unsigned long)init_secondary;
890 ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */
891
892 memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
893
894 smp_trap_init(ctxt.trap_ctxt);
895
896 ctxt.ldt_ents = 0;
897 ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT);
898 ctxt.gdt_ents = 512;
899
900 #ifdef __i386__
901 ctxt.user_regs.esp = boot_stack + PAGE_SIZE;
902
903 ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
904 ctxt.kernel_sp = boot_stack + PAGE_SIZE;
905
906 ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
907 ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback;
908 ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
909 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
910
911 ctxt.ctrlreg[3] = xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
912 #else /* __x86_64__ */
913 ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
914 ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
915 ctxt.kernel_sp = idle->thread.rsp0;
916
917 ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
918 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
919 ctxt.syscall_callback_eip = (unsigned long)system_call;
920
921 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
922
923 ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
924 #endif
925
926 printf("gdtpfn=%lx pdptpfn=%lx\n",
927 ctxt.gdt_frames[0],
928 ctxt.ctrlreg[3] >> PAGE_SHIFT);
929
930 PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
931 DELAY(3000);
932 PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
933 }
934
/*
 * This function starts the AP (application processor) identified
 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It isn't pretty,
 * but it seems to work.
 *
 * NOTE(review): despite the parameter name, callers in start_all_aps()
 * pass the *logical* cpu id, which is what cpu_initialize_context()
 * expects -- confirm before renaming or changing callers.
 */

int cpus;	/* snapshot of mp_naps, used as a launch watchpoint */
static int
start_ap(int apic_id)
{
	int ms;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	/* Build the vcpu context and ask Xen to run it. */
	cpu_initialize_context(apic_id);

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		if (mp_naps > cpus)
			return 1;	/* return SUCCESS */
		DELAY(1000);
	}
	return 0;		/* return FAILURE */
}
962
/*
 * send an IPI to a specific CPU.
 */
static void
ipi_send_cpu(int cpu, u_int ipi)
{
	u_int bitmap, old_pending, new_pending;

	if (IPI_IS_BITMAPED(ipi)) {
		/*
		 * Bitmapped IPIs (preempt/AST) are coalesced: set the
		 * bit in the target's pending word and only raise the
		 * event channel when no bits were pending before.
		 */
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
		do {
			old_pending = cpu_ipi_pending[cpu];
			new_pending = old_pending | bitmap;
		} while  (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
		    old_pending, new_pending));
		if (!old_pending)
			ipi_pcpu(cpu, RESCHEDULE_VECTOR);
	} else {
		/*
		 * Function IPIs: the actual vector travels in
		 * call_data->func_id (set by the shootdown initiator),
		 * so only CALL_FUNCTION_VECTOR is raised here.
		 */
		KASSERT(call_data != NULL, ("call_data not set"));
		ipi_pcpu(cpu, CALL_FUNCTION_VECTOR);
	}
}
986
/*
 * Flush the TLB on all other CPU's
 *
 * Publishes the vector and arguments through 'call_data' (serialized
 * by smp_ipi_mtx), IPIs every other CPU, and spins until each one has
 * bumped smp_tlb_wait.
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;
	struct _call_data data;

	ncpu = mp_ncpus - 1;	/* does not shootdown self */
	if (ncpu < 1)
		return;		/* no other cpus */
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	KASSERT(call_data == NULL, ("call_data isn't null?!"));
	call_data = &data;	/* on-stack; valid only while holding the lock */
	call_data->func_id = vector;
	call_data->arg1 = addr1;
	call_data->arg2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	/* Wait for every other CPU to acknowledge. */
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	call_data = NULL;
	mtx_unlock_spin(&smp_ipi_mtx);
}
1014
/*
 * As smp_tlb_shootdown(), but only IPIs the CPUs in 'mask' (a mask of
 * all ones means every CPU but self).  The caller's own bit is stripped.
 */
static void
smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	int ncpu, othercpus;
	struct _call_data data;

	othercpus = mp_ncpus - 1;
	if (mask == (u_int)-1) {
		/* Broadcast: everyone but ourselves. */
		ncpu = othercpus;
		if (ncpu < 1)
			return;
	} else {
		mask &= ~PCPU_GET(cpumask);
		if (mask == 0)
			return;
		ncpu = bitcount32(mask);
		if (ncpu > othercpus) {
			/* XXX this should be a panic offence */
			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
			    ncpu, othercpus);
			ncpu = othercpus;
		}
		/* XXX should be a panic, implied by mask == 0 above */
		if (ncpu < 1)
			return;
	}
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	KASSERT(call_data == NULL, ("call_data isn't null?!"));
	call_data = &data;	/* on-stack; valid only while holding the lock */
	call_data->func_id = vector;
	call_data->arg1 = addr1;
	call_data->arg2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (mask == (u_int)-1)
		ipi_all_but_self(vector);
	else
		ipi_selected(mask, vector);
	/* Wait for every targeted CPU to acknowledge. */
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	call_data = NULL;
	mtx_unlock_spin(&smp_ipi_mtx);
}
1059
1060 void
1061 smp_cache_flush(void)
1062 {
1063
1064 if (smp_started)
1065 smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
1066 }
1067
1068 void
1069 smp_invltlb(void)
1070 {
1071
1072 if (smp_started) {
1073 smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
1074 }
1075 }
1076
1077 void
1078 smp_invlpg(vm_offset_t addr)
1079 {
1080
1081 if (smp_started) {
1082 smp_tlb_shootdown(IPI_INVLPG, addr, 0);
1083 }
1084 }
1085
1086 void
1087 smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
1088 {
1089
1090 if (smp_started) {
1091 smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
1092 }
1093 }
1094
1095 void
1096 smp_masked_invltlb(cpumask_t mask)
1097 {
1098
1099 if (smp_started) {
1100 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
1101 }
1102 }
1103
1104 void
1105 smp_masked_invlpg(cpumask_t mask, vm_offset_t addr)
1106 {
1107
1108 if (smp_started) {
1109 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
1110 }
1111 }
1112
1113 void
1114 smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2)
1115 {
1116
1117 if (smp_started) {
1118 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
1119 }
1120 }
1121
1122 /*
1123 * send an IPI to a set of cpus.
1124 */
1125 void
1126 ipi_selected(cpumask_t cpus, u_int ipi)
1127 {
1128 int cpu;
1129
1130 /*
1131 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
1132 * of help in order to understand what is the source.
1133 * Set the mask of receiving CPUs for this purpose.
1134 */
1135 if (ipi == IPI_STOP_HARD)
1136 atomic_set_int(&ipi_nmi_pending, cpus);
1137
1138 while ((cpu = ffs(cpus)) != 0) {
1139 cpu--;
1140 cpus &= ~(1 << cpu);
1141 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1142 ipi_send_cpu(cpu, ipi);
1143 }
1144 }
1145
/*
 * send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		atomic_set_int(&ipi_nmi_pending, 1 << cpu);

	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
	ipi_send_cpu(cpu, ipi);
}
1164
/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus));

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	/* Fan out through ipi_selected() using the precomputed mask. */
	ipi_selected(PCPU_GET(other_cpus), ipi);
}
1183
1184 int
1185 ipi_nmi_handler()
1186 {
1187 cpumask_t cpumask;
1188
1189 /*
1190 * As long as there is not a simple way to know about a NMI's
1191 * source, if the bitmask for the current CPU is present in
1192 * the global pending bitword an IPI_STOP_HARD has been issued
1193 * and should be handled.
1194 */
1195 cpumask = PCPU_GET(cpumask);
1196 if ((ipi_nmi_pending & cpumask) == 0)
1197 return (1);
1198
1199 atomic_clear_int(&ipi_nmi_pending, cpumask);
1200 cpustop_handler();
1201 return (0);
1202 }
1203
/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	int cpu = PCPU_GET(cpuid);
	/* NOTE(review): cpumask_t would be the natural type here -- confirm width */
	int cpumask = PCPU_GET(cpumask);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	atomic_set_int(&stopped_cpus, cpumask);

	/* Wait for restart */
	while (!(started_cpus & cpumask))
		ia32_pause();

	atomic_clear_int(&started_cpus, cpumask);
	atomic_clear_int(&stopped_cpus, cpumask);

	/* CPU 0 runs any one-shot restart function on behalf of everyone. */
	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}
1231
1232 /*
1233 * This is called once the rest of the system is up and running and we're
1234 * ready to let the AP's out of the pen.
1235 */
1236 static void
1237 release_aps(void *dummy __unused)
1238 {
1239
1240 if (mp_ncpus == 1)
1241 return;
1242 atomic_store_rel_int(&aps_ready, 1);
1243 while (smp_started == 0)
1244 ia32_pause();
1245 }
1246 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1247 SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL);
1248
Cache object: 4b2c13ca25f9ca9aeb9e783edd84e9dd
|