
FreeBSD/Linux Kernel Cross Reference
sys/x86/x86/mp_x86.c


    1 /*-
    2  * Copyright (c) 1996, by Steve Passe
    3  * Copyright (c) 2003, by Peter Wemm
    4  * All rights reserved.
    5  *
    6  * Redistribution and use in source and binary forms, with or without
    7  * modification, are permitted provided that the following conditions
    8  * are met:
    9  * 1. Redistributions of source code must retain the above copyright
   10  *    notice, this list of conditions and the following disclaimer.
   11  * 2. The name of the developer may NOT be used to endorse or promote products
   12  *    derived from this software without specific prior written permission.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 #include <sys/cdefs.h>
   28 __FBSDID("$FreeBSD$");
   29 
   30 #ifdef __i386__
   31 #include "opt_apic.h"
   32 #endif
   33 #include "opt_cpu.h"
   34 #include "opt_kstack_pages.h"
   35 #include "opt_pmap.h"
   36 #include "opt_sched.h"
   37 #include "opt_smp.h"
   38 
   39 #include <sys/param.h>
   40 #include <sys/systm.h>
   41 #include <sys/bus.h>
   42 #include <sys/cons.h>   /* cngetc() */
   43 #include <sys/cpuset.h>
   44 #ifdef GPROF 
   45 #include <sys/gmon.h>
   46 #endif
   47 #include <sys/kernel.h>
   48 #include <sys/ktr.h>
   49 #include <sys/lock.h>
   50 #include <sys/malloc.h>
   51 #include <sys/memrange.h>
   52 #include <sys/mutex.h>
   53 #include <sys/pcpu.h>
   54 #include <sys/proc.h>
   55 #include <sys/sched.h>
   56 #include <sys/smp.h>
   57 #include <sys/sysctl.h>
   58 
   59 #include <vm/vm.h>
   60 #include <vm/vm_param.h>
   61 #include <vm/pmap.h>
   62 #include <vm/vm_kern.h>
   63 #include <vm/vm_extern.h>
   64 #include <vm/vm_map.h>
   65 
   66 #include <x86/apicreg.h>
   67 #include <machine/clock.h>
   68 #include <machine/cpu.h>
   69 #include <machine/cputypes.h>
   70 #include <x86/mca.h>
   71 #include <machine/md_var.h>
   72 #include <machine/pcb.h>
   73 #include <machine/psl.h>
   74 #include <machine/smp.h>
   75 #include <machine/specialreg.h>
   76 #include <x86/ucode.h>
   77 
   78 /* lock region used by kernel profiling */
   79 int     mcount_lock;
   80 
    81 int     mp_naps;                /* # of Application Processors */
   82 int     boot_cpu_id = -1;       /* designated BSP */
   83 
   84 extern  struct pcpu __pcpu[];
   85 
   86 /* AP uses this during bootstrap.  Do not staticize.  */
   87 char *bootSTK;
   88 int bootAP;
   89 
   90 /* Free these after use */
   91 void *bootstacks[MAXCPU];
   92 void *dpcpu;
   93 
   94 struct pcb stoppcbs[MAXCPU];
   95 struct susppcb **susppcbs;
   96 
   97 #ifdef COUNT_IPIS
   98 /* Interrupt counts. */
   99 static u_long *ipi_preempt_counts[MAXCPU];
  100 static u_long *ipi_ast_counts[MAXCPU];
  101 u_long *ipi_invltlb_counts[MAXCPU];
  102 u_long *ipi_invlrng_counts[MAXCPU];
  103 u_long *ipi_invlpg_counts[MAXCPU];
  104 u_long *ipi_invlcache_counts[MAXCPU];
  105 u_long *ipi_rendezvous_counts[MAXCPU];
  106 static u_long *ipi_hardclock_counts[MAXCPU];
  107 #endif
  108 
  109 /* Default cpu_ops implementation. */
  110 struct cpu_ops cpu_ops;
  111 
  112 /*
  113  * Local data and functions.
  114  */
  115 
  116 static volatile cpuset_t ipi_stop_nmi_pending;
  117 
  118 volatile cpuset_t resuming_cpus;
  119 volatile cpuset_t toresume_cpus;
  120 
   121 /* used to hold the APs until we are ready to release them */
  122 struct mtx ap_boot_mtx;
  123 
  124 /* Set to 1 once we're ready to let the APs out of the pen. */
  125 volatile int aps_ready = 0;
  126 
  127 /*
  128  * Store data from cpu_add() until later in the boot when we actually setup
  129  * the APs.
  130  */
  131 struct cpu_info cpu_info[MAX_APIC_ID + 1];
  132 int apic_cpuids[MAX_APIC_ID + 1];
  133 int cpu_apic_ids[MAXCPU];
  134 
  135 /* Holds pending bitmap based IPIs per CPU */
  136 volatile u_int cpu_ipi_pending[MAXCPU];
  137 
  138 static void     release_aps(void *dummy);
  139 static void     cpustop_handler_post(u_int cpu);
  140 
  141 static int      hyperthreading_allowed = 1;
  142 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
  143         &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
  144 
  145 static struct topo_node topo_root;
  146 
  147 static int pkg_id_shift;
  148 static int core_id_shift;
  149 static int disabled_cpus;
  150 
  151 struct cache_info {
  152         int     id_shift;
  153         int     present;
  154 } static caches[MAX_CACHE_LEVELS];
  155 
  156 void
  157 mem_range_AP_init(void)
  158 {
  159 
  160         if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
  161                 mem_range_softc.mr_op->initAP(&mem_range_softc);
  162 }
  163 
  164 /*
  165  * Round up to the next power of two, if necessary, and then
  166  * take log2.
  167  * Returns -1 if argument is zero.
  168  */
  169 static __inline int
  170 mask_width(u_int x)
  171 {
  172 
  173         return (fls(x << (1 - powerof2(x))) - 1);
  174 }
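
The one-liner above packs "round up to the next power of two, then take log2" into a single expression. A minimal userland sketch of the same computation, with fls() and powerof2() re-created here purely for illustration (the kernel supplies its own), shows the intended values:

#include <stdio.h>

/* Illustrative stand-ins for the kernel's fls() and powerof2(). */
static int fls_sketch(unsigned int x) { return (x == 0 ? 0 : 32 - __builtin_clz(x)); }
#define powerof2_sketch(x) ((((x) - 1) & (x)) == 0)

static int
mask_width_sketch(unsigned int x)
{
        /* A non-power of two is doubled first, so fls() rounds it up. */
        return (fls_sketch(x << (1 - powerof2_sketch(x))) - 1);
}

int
main(void)
{
        /* Prints "0 1 2 3 -1": powers of two give plain log2, 6 rounds up
           to 8 (log2 = 3), and 0 yields the documented -1 sentinel. */
        printf("%d %d %d %d %d\n", mask_width_sketch(1), mask_width_sketch(2),
            mask_width_sketch(4), mask_width_sketch(6), mask_width_sketch(0));
        return (0);
}
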
  175 
  176 /*
  177  * Add a cache level to the cache topology description.
  178  */
  179 static int
  180 add_deterministic_cache(int type, int level, int share_count)
  181 {
  182 
  183         if (type == 0)
  184                 return (0);
  185         if (type > 3) {
  186                 printf("unexpected cache type %d\n", type);
  187                 return (1);
  188         }
  189         if (type == 2) /* ignore instruction cache */
  190                 return (1);
  191         if (level == 0 || level > MAX_CACHE_LEVELS) {
   192                 printf("unexpected cache level %d\n", level);
  193                 return (1);
  194         }
  195 
  196         if (caches[level - 1].present) {
  197                 printf("WARNING: multiple entries for L%u data cache\n", level);
  198                 printf("%u => %u\n", caches[level - 1].id_shift,
  199                     mask_width(share_count));
  200         }
  201         caches[level - 1].id_shift = mask_width(share_count);
  202         caches[level - 1].present = 1;
  203 
  204         if (caches[level - 1].id_shift > pkg_id_shift) {
  205                 printf("WARNING: L%u data cache covers more "
  206                     "APIC IDs than a package\n", level);
  207                 printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift);
  208                 caches[level - 1].id_shift = pkg_id_shift;
  209         }
  210         if (caches[level - 1].id_shift < core_id_shift) {
  211                 printf("WARNING: L%u data cache covers less "
  212                     "APIC IDs than a core\n", level);
  213                 printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift);
  214                 caches[level - 1].id_shift = core_id_shift;
  215         }
  216 
  217         return (1);
  218 }
  219 
  220 /*
  221  * Determine topology of processing units and caches for AMD CPUs.
  222  * See:
  223  *  - AMD CPUID Specification (Publication # 25481)
  224  *  - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
  225  *  - BKDG For AMD Family 10h Processors (Publication # 31116)
  226  *  - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
  227  *  - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
  228  *  - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
  229  */
  230 static void
  231 topo_probe_amd(void)
  232 {
  233         u_int p[4];
  234         uint64_t v;
  235         int level;
  236         int nodes_per_socket;
  237         int share_count;
  238         int type;
  239         int i;
  240 
  241         /* No multi-core capability. */
  242         if ((amd_feature2 & AMDID2_CMP) == 0)
  243                 return;
  244 
  245         /* For families 10h and newer. */
  246         pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
  247             AMDID_COREID_SIZE_SHIFT;
  248 
  249         /* For 0Fh family. */
  250         if (pkg_id_shift == 0)
  251                 pkg_id_shift =
  252                     mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
  253 
  254         /*
  255          * Families prior to 16h define the following value as
  256          * cores per compute unit and we don't really care about the AMD
  257          * compute units at the moment.  Perhaps we should treat them as
  258          * cores and cores within the compute units as hardware threads,
  259          * but that's up for debate.
  260          * Later families define the value as threads per compute unit,
  261          * so we are following AMD's nomenclature here.
  262          */
  263         if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
  264             CPUID_TO_FAMILY(cpu_id) >= 0x16) {
  265                 cpuid_count(0x8000001e, 0, p);
  266                 share_count = ((p[1] >> 8) & 0xff) + 1;
  267                 core_id_shift = mask_width(share_count);
  268         }
  269 
  270         if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
  271                 for (i = 0; ; i++) {
  272                         cpuid_count(0x8000001d, i, p);
  273                         type = p[0] & 0x1f;
  274                         level = (p[0] >> 5) & 0x7;
  275                         share_count = 1 + ((p[0] >> 14) & 0xfff);
  276 
  277                         if (!add_deterministic_cache(type, level, share_count))
  278                                 break;
  279                 }
  280         } else {
  281                 if (cpu_exthigh >= 0x80000005) {
  282                         cpuid_count(0x80000005, 0, p);
  283                         if (((p[2] >> 24) & 0xff) != 0) {
  284                                 caches[0].id_shift = 0;
  285                                 caches[0].present = 1;
  286                         }
  287                 }
  288                 if (cpu_exthigh >= 0x80000006) {
  289                         cpuid_count(0x80000006, 0, p);
  290                         if (((p[2] >> 16) & 0xffff) != 0) {
  291                                 caches[1].id_shift = 0;
  292                                 caches[1].present = 1;
  293                         }
  294                         if (((p[3] >> 18) & 0x3fff) != 0) {
  295                                 nodes_per_socket = 1;
  296                                 if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
  297                                         /*
  298                                          * Handle multi-node processors that
  299                                          * have multiple chips, each with its
  300                                          * own L3 cache, on the same die.
  301                                          */
  302                                         v = rdmsr(0xc001100c);
  303                                         nodes_per_socket = 1 + ((v >> 3) & 0x7);
  304                                 }
  305                                 caches[2].id_shift =
  306                                     pkg_id_shift - mask_width(nodes_per_socket);
  307                                 caches[2].present = 1;
  308                         }
  309                 }
  310         }
  311 }
  312 
  313 /*
  314  * Determine topology of processing units for Intel CPUs
  315  * using CPUID Leaf 1 and Leaf 4, if supported.
  316  * See:
  317  *  - Intel 64 Architecture Processor Topology Enumeration
   318  *  - Intel 64 and IA-32 Architectures Software Developer’s Manual,
  319  *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
  320  *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
  321  */
  322 static void
  323 topo_probe_intel_0x4(void)
  324 {
  325         u_int p[4];
  326         int max_cores;
  327         int max_logical;
  328 
  329         /* Both zero and one here mean one logical processor per package. */
  330         max_logical = (cpu_feature & CPUID_HTT) != 0 ?
  331             (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
  332         if (max_logical <= 1)
  333                 return;
  334 
  335         if (cpu_high >= 0x4) {
  336                 cpuid_count(0x04, 0, p);
  337                 max_cores = ((p[0] >> 26) & 0x3f) + 1;
  338         } else
  339                 max_cores = 1;
  340 
  341         core_id_shift = mask_width(max_logical/max_cores);
  342         KASSERT(core_id_shift >= 0,
  343             ("intel topo: max_cores > max_logical\n"));
  344         pkg_id_shift = core_id_shift + mask_width(max_cores);
  345 }
  346 
  347 /*
  348  * Determine topology of processing units for Intel CPUs
  349  * using CPUID Leaf 11, if supported.
  350  * See:
  351  *  - Intel 64 Architecture Processor Topology Enumeration
   352  *  - Intel 64 and IA-32 Architectures Software Developer’s Manual,
  353  *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
  354  *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
  355  */
  356 static void
  357 topo_probe_intel_0xb(void)
  358 {
  359         u_int p[4];
  360         int bits;
  361         int type;
  362         int i;
  363 
  364         /* Fall back if CPU leaf 11 doesn't really exist. */
  365         cpuid_count(0x0b, 0, p);
  366         if (p[1] == 0) {
  367                 topo_probe_intel_0x4();
  368                 return;
  369         }
  370 
  371         /* We only support three levels for now. */
  372         for (i = 0; ; i++) {
  373                 cpuid_count(0x0b, i, p);
  374 
  375                 bits = p[0] & 0x1f;
  376                 type = (p[2] >> 8) & 0xff;
  377 
  378                 if (type == 0)
  379                         break;
  380 
  381                 /* TODO: check for duplicate (re-)assignment */
  382                 if (type == CPUID_TYPE_SMT)
  383                         core_id_shift = bits;
  384                 else if (type == CPUID_TYPE_CORE)
  385                         pkg_id_shift = bits;
  386                 else
  387                         printf("unknown CPU level type %d\n", type);
  388         }
  389 
  390         if (pkg_id_shift < core_id_shift) {
  391                 printf("WARNING: core covers more APIC IDs than a package\n");
  392                 core_id_shift = pkg_id_shift;
  393         }
  394 }
  395 
  396 /*
  397  * Determine topology of caches for Intel CPUs.
  398  * See:
  399  *  - Intel 64 Architecture Processor Topology Enumeration
  400  *  - Intel 64 and IA-32 Architectures Software Developer’s Manual
  401  *    Volume 2A: Instruction Set Reference, A-M,
  402  *    CPUID instruction
  403  */
  404 static void
  405 topo_probe_intel_caches(void)
  406 {
  407         u_int p[4];
  408         int level;
  409         int share_count;
  410         int type;
  411         int i;
  412 
  413         if (cpu_high < 0x4) {
  414                 /*
  415                  * Available cache level and sizes can be determined
  416                  * via CPUID leaf 2, but that requires a huge table of hardcoded
  417                  * values, so for now just assume L1 and L2 caches potentially
  418                  * shared only by HTT processing units, if HTT is present.
  419                  */
  420                 caches[0].id_shift = pkg_id_shift;
  421                 caches[0].present = 1;
  422                 caches[1].id_shift = pkg_id_shift;
  423                 caches[1].present = 1;
  424                 return;
  425         }
  426 
  427         for (i = 0; ; i++) {
  428                 cpuid_count(0x4, i, p);
  429                 type = p[0] & 0x1f;
  430                 level = (p[0] >> 5) & 0x7;
  431                 share_count = 1 + ((p[0] >> 14) & 0xfff);
  432 
  433                 if (!add_deterministic_cache(type, level, share_count))
  434                         break;
  435         }
  436 }
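
This loop and the AMD leaf 0x8000001d loop above decode the same EAX layout, as the masks show: cache type in bits 4:0, cache level in bits 7:5, and "maximum addressable IDs sharing this cache, minus one" in bits 25:14. A short sketch that decodes a hand-built EAX value (the value itself is invented for illustration) makes the field extraction concrete:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        /* Invented example: type 1 (data), level 1, shared by 2 APIC IDs. */
        uint32_t eax = (1u << 0) | (1u << 5) | (1u << 14);

        int type = eax & 0x1f;                          /* EAX[4:0]   */
        int level = (eax >> 5) & 0x7;                   /* EAX[7:5]   */
        int share_count = 1 + ((eax >> 14) & 0xfff);    /* EAX[25:14] */

        /* Prints "type 1, level 1, shared by 2 APIC IDs". */
        printf("type %d, level %d, shared by %d APIC IDs\n",
            type, level, share_count);
        return (0);
}
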
  437 
  438 /*
  439  * Determine topology of processing units and caches for Intel CPUs.
  440  * See:
  441  *  - Intel 64 Architecture Processor Topology Enumeration
  442  */
  443 static void
  444 topo_probe_intel(void)
  445 {
  446 
  447         /*
  448          * Note that 0x1 <= cpu_high < 4 case should be
  449          * compatible with topo_probe_intel_0x4() logic when
  450          * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
  451          * or it should trigger the fallback otherwise.
  452          */
  453         if (cpu_high >= 0xb)
  454                 topo_probe_intel_0xb();
  455         else if (cpu_high >= 0x1)
  456                 topo_probe_intel_0x4();
  457 
  458         topo_probe_intel_caches();
  459 }
  460 
  461 /*
  462  * Topology information is queried only on BSP, on which this
  463  * code runs and for which it can query CPUID information.
  464  * Then topology is extrapolated on all packages using an
  465  * assumption that APIC ID to hardware component ID mapping is
   466  * homogeneous.
   467  * That doesn't necessarily imply that the topology is uniform.
  468  */
  469 void
  470 topo_probe(void)
  471 {
  472         static int cpu_topo_probed = 0;
  473         struct x86_topo_layer {
  474                 int type;
  475                 int subtype;
  476                 int id_shift;
  477         } topo_layers[MAX_CACHE_LEVELS + 3];
  478         struct topo_node *parent;
  479         struct topo_node *node;
  480         int layer;
  481         int nlayers;
  482         int node_id;
  483         int i;
  484 
  485         if (cpu_topo_probed)
  486                 return;
  487 
  488         CPU_ZERO(&logical_cpus_mask);
  489 
  490         if (mp_ncpus <= 1)
  491                 ; /* nothing */
  492         else if (cpu_vendor_id == CPU_VENDOR_AMD)
  493                 topo_probe_amd();
  494         else if (cpu_vendor_id == CPU_VENDOR_INTEL)
  495                 topo_probe_intel();
  496 
  497         KASSERT(pkg_id_shift >= core_id_shift,
  498             ("bug in APIC topology discovery"));
  499 
  500         nlayers = 0;
  501         bzero(topo_layers, sizeof(topo_layers));
  502 
  503         topo_layers[nlayers].type = TOPO_TYPE_PKG;
  504         topo_layers[nlayers].id_shift = pkg_id_shift;
  505         if (bootverbose)
  506                 printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
  507         nlayers++;
  508 
  509         /*
  510          * Consider all caches to be within a package/chip
  511          * and "in front" of all sub-components like
  512          * cores and hardware threads.
  513          */
  514         for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
  515                 if (caches[i].present) {
  516                         KASSERT(caches[i].id_shift <= pkg_id_shift,
  517                                 ("bug in APIC topology discovery"));
  518                         KASSERT(caches[i].id_shift >= core_id_shift,
  519                                 ("bug in APIC topology discovery"));
  520 
  521                         topo_layers[nlayers].type = TOPO_TYPE_CACHE;
  522                         topo_layers[nlayers].subtype = i + 1;
  523                         topo_layers[nlayers].id_shift = caches[i].id_shift;
  524                         if (bootverbose)
  525                                 printf("L%u cache ID shift: %u\n",
  526                                     topo_layers[nlayers].subtype,
  527                                     topo_layers[nlayers].id_shift);
  528                         nlayers++;
  529                 }
  530         }
  531 
  532         if (pkg_id_shift > core_id_shift) {
  533                 topo_layers[nlayers].type = TOPO_TYPE_CORE;
  534                 topo_layers[nlayers].id_shift = core_id_shift;
  535                 if (bootverbose)
  536                         printf("Core ID shift: %u\n",
  537                             topo_layers[nlayers].id_shift);
  538                 nlayers++;
  539         }
  540 
  541         topo_layers[nlayers].type = TOPO_TYPE_PU;
  542         topo_layers[nlayers].id_shift = 0;
  543         nlayers++;
  544 
  545         topo_init_root(&topo_root);
  546         for (i = 0; i <= MAX_APIC_ID; ++i) {
  547                 if (!cpu_info[i].cpu_present)
  548                         continue;
  549 
  550                 parent = &topo_root;
  551                 for (layer = 0; layer < nlayers; ++layer) {
  552                         node_id = i >> topo_layers[layer].id_shift;
  553                         parent = topo_add_node_by_hwid(parent, node_id,
  554                             topo_layers[layer].type,
  555                             topo_layers[layer].subtype);
  556                 }
  557         }
  558 
  559         parent = &topo_root;
  560         for (layer = 0; layer < nlayers; ++layer) {
  561                 node_id = boot_cpu_id >> topo_layers[layer].id_shift;
  562                 node = topo_find_node_by_hwid(parent, node_id,
  563                     topo_layers[layer].type,
  564                     topo_layers[layer].subtype);
  565                 topo_promote_child(node);
  566                 parent = node;
  567         }
  568 
  569         cpu_topo_probed = 1;
  570 }
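
The id_shift values gathered into topo_layers split every APIC ID into package, cache, core, and thread fields: a layer's hardware node ID is simply the APIC ID shifted right by that layer's id_shift. A small sketch with made-up shifts (pkg_id_shift = 4 and core_id_shift = 1, i.e. 16 APIC IDs per package and 2 per core) illustrates the decomposition:

#include <stdio.h>

int
main(void)
{
        /* Made-up shifts for illustration; topo_probe() derives the real ones. */
        int pkg_id_shift = 4, core_id_shift = 1;
        unsigned int apic_id = 0x13;            /* arbitrary example APIC ID */

        unsigned int pkg_node = apic_id >> pkg_id_shift;
        unsigned int core_node = apic_id >> core_id_shift;
        unsigned int smt_bits = apic_id & ((1u << core_id_shift) - 1);

        /* Prints "APIC ID 0x13 -> package node 1, core node 9, SMT bits 1". */
        printf("APIC ID %#x -> package node %u, core node %u, SMT bits %u\n",
            apic_id, pkg_node, core_node, smt_bits);
        return (0);
}
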
  571 
  572 /*
  573  * Assign logical CPU IDs to local APICs.
  574  */
  575 void
  576 assign_cpu_ids(void)
  577 {
  578         struct topo_node *node;
  579         u_int smt_mask;
  580 
  581         smt_mask = (1u << core_id_shift) - 1;
  582 
  583         /*
  584          * Assign CPU IDs to local APIC IDs and disable any CPUs
  585          * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
  586          */
  587         mp_ncpus = 0;
  588         TOPO_FOREACH(node, &topo_root) {
  589                 if (node->type != TOPO_TYPE_PU)
  590                         continue;
  591 
  592                 if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
  593                         cpu_info[node->hwid].cpu_hyperthread = 1;
  594 
  595                 if (resource_disabled("lapic", node->hwid)) {
  596                         if (node->hwid != boot_cpu_id)
  597                                 cpu_info[node->hwid].cpu_disabled = 1;
  598                         else
  599                                 printf("Cannot disable BSP, APIC ID = %d\n",
  600                                     node->hwid);
  601                 }
  602 
  603                 if (!hyperthreading_allowed &&
  604                     cpu_info[node->hwid].cpu_hyperthread)
  605                         cpu_info[node->hwid].cpu_disabled = 1;
  606 
  607                 if (mp_ncpus >= MAXCPU)
  608                         cpu_info[node->hwid].cpu_disabled = 1;
  609 
  610                 if (cpu_info[node->hwid].cpu_disabled) {
  611                         disabled_cpus++;
  612                         continue;
  613                 }
  614 
  615                 cpu_apic_ids[mp_ncpus] = node->hwid;
  616                 apic_cpuids[node->hwid] = mp_ncpus;
  617                 topo_set_pu_id(node, mp_ncpus);
  618                 mp_ncpus++;
  619         }
  620 
  621         KASSERT(mp_maxid >= mp_ncpus - 1,
  622             ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
  623             mp_ncpus));
  624 }
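
With two hardware threads per core (core_id_shift = 1, an assumed value) smt_mask is 1, so any processing unit whose low SMT bit differs from the BSP's is flagged as a hyperthread. A tiny sketch of just that test, with the BSP assumed to sit at APIC ID 0:

#include <stdio.h>

int
main(void)
{
        /* Assumed values: BSP at APIC ID 0, two threads per core. */
        int core_id_shift = 1, boot_cpu_id = 0;
        unsigned int smt_mask = (1u << core_id_shift) - 1;

        /* Prints: 0 primary, 1 hyperthread, 2 primary, 3 hyperthread. */
        for (unsigned int hwid = 0; hwid < 4; hwid++)
                printf("APIC ID %u: %s\n", hwid,
                    (hwid & smt_mask) != (boot_cpu_id & smt_mask) ?
                    "hyperthread" : "primary thread");
        return (0);
}
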
  625 
  626 /*
  627  * Print various information about the SMP system hardware and setup.
  628  */
  629 void
  630 cpu_mp_announce(void)
  631 {
  632         struct topo_node *node;
  633         const char *hyperthread;
  634         int pkg_count;
  635         int cores_per_pkg;
  636         int thrs_per_core;
  637 
  638         printf("FreeBSD/SMP: ");
  639         if (topo_analyze(&topo_root, 1, &pkg_count,
  640             &cores_per_pkg, &thrs_per_core)) {
  641                 printf("%d package(s)", pkg_count);
  642                 if (cores_per_pkg > 0)
  643                         printf(" x %d core(s)", cores_per_pkg);
  644                 if (thrs_per_core > 1)
  645                     printf(" x %d hardware threads", thrs_per_core);
  646         } else {
  647                 printf("Non-uniform topology");
  648         }
  649         printf("\n");
  650 
  651         if (disabled_cpus) {
  652                 printf("FreeBSD/SMP Online: ");
  653                 if (topo_analyze(&topo_root, 0, &pkg_count,
  654                     &cores_per_pkg, &thrs_per_core)) {
  655                         printf("%d package(s)", pkg_count);
  656                         if (cores_per_pkg > 0)
  657                                 printf(" x %d core(s)", cores_per_pkg);
  658                         if (thrs_per_core > 1)
  659                             printf(" x %d hardware threads", thrs_per_core);
  660                 } else {
  661                         printf("Non-uniform topology");
  662                 }
  663                 printf("\n");
  664         }
  665 
  666         if (!bootverbose)
  667                 return;
  668 
  669         TOPO_FOREACH(node, &topo_root) {
  670                 switch (node->type) {
  671                 case TOPO_TYPE_PKG:
  672                         printf("Package HW ID = %u (%#x)\n",
  673                             node->hwid, node->hwid);
  674                         break;
  675                 case TOPO_TYPE_CORE:
  676                         printf("\tCore HW ID = %u (%#x)\n",
  677                             node->hwid, node->hwid);
  678                         break;
  679                 case TOPO_TYPE_PU:
  680                         if (cpu_info[node->hwid].cpu_hyperthread)
  681                                 hyperthread = "/HT";
  682                         else
  683                                 hyperthread = "";
  684 
  685                         if (node->subtype == 0)
  686                                 printf("\t\tCPU (AP%s): APIC ID: %u (%#x)"
  687                                     "(disabled)\n", hyperthread, node->hwid,
  688                                     node->hwid);
  689                         else if (node->id == 0)
  690                                 printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n",
  691                                     node->hwid, node->hwid);
  692                         else
  693                                 printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n",
  694                                     node->id, hyperthread, node->hwid,
  695                                     node->hwid);
  696                         break;
  697                 default:
  698                         /* ignored */
  699                         break;
  700                 }
  701         }
  702 }
  703 
  704 /*
  705  * Add a scheduling group, a group of logical processors sharing
  706  * a particular cache (and, thus having an affinity), to the scheduling
  707  * topology.
  708  * This function recursively works on lower level caches.
  709  */
  710 static void
  711 x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
  712 {
  713         struct topo_node *node;
  714         int nchildren;
  715         int ncores;
  716         int i;
  717 
  718         KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE,
  719             ("x86topo_add_sched_group: bad type: %u", root->type));
  720         CPU_COPY(&root->cpuset, &cg_root->cg_mask);
  721         cg_root->cg_count = root->cpu_count;
  722         if (root->type == TOPO_TYPE_SYSTEM)
  723                 cg_root->cg_level = CG_SHARE_NONE;
  724         else
  725                 cg_root->cg_level = root->subtype;
  726 
  727         /*
  728          * Check how many core nodes we have under the given root node.
  729          * If we have multiple logical processors, but not multiple
  730          * cores, then those processors must be hardware threads.
  731          */
  732         ncores = 0;
  733         node = root;
  734         while (node != NULL) {
  735                 if (node->type != TOPO_TYPE_CORE) {
  736                         node = topo_next_node(root, node);
  737                         continue;
  738                 }
  739 
  740                 ncores++;
  741                 node = topo_next_nonchild_node(root, node);
  742         }
  743 
  744         if (cg_root->cg_level != CG_SHARE_NONE &&
  745             root->cpu_count > 1 && ncores < 2)
  746                 cg_root->cg_flags = CG_FLAG_SMT;
  747 
  748         /*
  749          * Find out how many cache nodes we have under the given root node.
  750          * We ignore cache nodes that cover all the same processors as the
  751          * root node.  Also, we do not descend below found cache nodes.
  752          * That is, we count top-level "non-redundant" caches under the root
  753          * node.
  754          */
  755         nchildren = 0;
  756         node = root;
  757         while (node != NULL) {
  758                 if (node->type != TOPO_TYPE_CACHE ||
  759                     (root->type != TOPO_TYPE_SYSTEM &&
  760                     CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
  761                         node = topo_next_node(root, node);
  762                         continue;
  763                 }
  764                 nchildren++;
  765                 node = topo_next_nonchild_node(root, node);
  766         }
  767 
  768         cg_root->cg_child = smp_topo_alloc(nchildren);
  769         cg_root->cg_children = nchildren;
  770 
  771         /*
  772          * Now find again the same cache nodes as above and recursively
  773          * build scheduling topologies for them.
  774          */
  775         node = root;
  776         i = 0;
  777         while (node != NULL) {
  778                 if (node->type != TOPO_TYPE_CACHE ||
  779                     (root->type != TOPO_TYPE_SYSTEM &&
  780                     CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
  781                         node = topo_next_node(root, node);
  782                         continue;
  783                 }
  784                 cg_root->cg_child[i].cg_parent = cg_root;
  785                 x86topo_add_sched_group(node, &cg_root->cg_child[i]);
  786                 i++;
  787                 node = topo_next_nonchild_node(root, node);
  788         }
  789 }
  790 
  791 /*
  792  * Build the MI scheduling topology from the discovered hardware topology.
  793  */
  794 struct cpu_group *
  795 cpu_topo(void)
  796 {
  797         struct cpu_group *cg_root;
  798 
  799         if (mp_ncpus <= 1)
  800                 return (smp_topo_none());
  801 
  802         cg_root = smp_topo_alloc(1);
  803         x86topo_add_sched_group(&topo_root, cg_root);
  804         return (cg_root);
  805 }
  806 
  807 
  808 /*
  809  * Add a logical CPU to the topology.
  810  */
  811 void
  812 cpu_add(u_int apic_id, char boot_cpu)
  813 {
  814 
  815         if (apic_id > MAX_APIC_ID) {
  816                 panic("SMP: APIC ID %d too high", apic_id);
  817                 return;
  818         }
  819         KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
  820             apic_id));
  821         cpu_info[apic_id].cpu_present = 1;
  822         if (boot_cpu) {
  823                 KASSERT(boot_cpu_id == -1,
  824                     ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
  825                     boot_cpu_id));
  826                 boot_cpu_id = apic_id;
  827                 cpu_info[apic_id].cpu_bsp = 1;
  828         }
  829         if (mp_ncpus < MAXCPU) {
  830                 mp_ncpus++;
  831                 mp_maxid = mp_ncpus - 1;
  832         }
  833         if (bootverbose)
  834                 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
  835                     "AP");
  836 }
  837 
  838 void
  839 cpu_mp_setmaxid(void)
  840 {
  841 
  842         /*
  843          * mp_ncpus and mp_maxid should be already set by calls to cpu_add().
  844          * If there were no calls to cpu_add() assume this is a UP system.
  845          */
  846         if (mp_ncpus == 0)
  847                 mp_ncpus = 1;
  848 }
  849 
  850 int
  851 cpu_mp_probe(void)
  852 {
  853 
  854         /*
  855          * Always record BSP in CPU map so that the mbuf init code works
  856          * correctly.
  857          */
  858         CPU_SETOF(0, &all_cpus);
  859         return (mp_ncpus > 1);
  860 }
  861 
  862 /*
   863  * AP CPUs call this to initialize themselves.
  864  */
  865 void
  866 init_secondary_tail(void)
  867 {
  868         u_int cpuid;
  869 
  870         pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));
  871 
  872         /*
  873          * On real hardware, switch to x2apic mode if possible.  Do it
  874          * after aps_ready was signalled, to avoid manipulating the
  875          * mode while BSP might still want to send some IPI to us
  876          * (second startup IPI is ignored on modern hardware etc).
  877          */
  878         lapic_xapic_mode();
  879 
  880         /* Initialize the PAT MSR. */
  881         pmap_init_pat();
  882 
  883         /* set up CPU registers and state */
  884         cpu_setregs();
  885 
  886         /* set up SSE/NX */
  887         initializecpu();
  888 
  889         /* set up FPU state on the AP */
  890 #ifdef __amd64__
  891         fpuinit();
  892 #else
  893         npxinit(false);
  894 #endif
  895 
  896         if (cpu_ops.cpu_init)
  897                 cpu_ops.cpu_init();
  898 
  899         /* A quick check from sanity claus */
  900         cpuid = PCPU_GET(cpuid);
  901         if (PCPU_GET(apic_id) != lapic_id()) {
  902                 printf("SMP: cpuid = %d\n", cpuid);
  903                 printf("SMP: actual apic_id = %d\n", lapic_id());
  904                 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
  905                 panic("cpuid mismatch! boom!!");
  906         }
  907 
  908         /* Initialize curthread. */
  909         KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
  910         PCPU_SET(curthread, PCPU_GET(idlethread));
  911 
  912         mtx_lock_spin(&ap_boot_mtx);
  913 
  914         mca_init();
  915 
   916         /* Init local APIC for IRQs */
  917         lapic_setup(1);
  918 
  919         /* Set memory range attributes for this CPU to match the BSP */
  920         mem_range_AP_init();
  921 
  922         smp_cpus++;
  923 
  924         CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
  925         printf("SMP: AP CPU #%d Launched!\n", cpuid);
  926 
  927         /* Determine if we are a logical CPU. */
  928         if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
  929                 CPU_SET(cpuid, &logical_cpus_mask);
  930 
  931         if (bootverbose)
  932                 lapic_dump("AP");
  933 
  934         if (smp_cpus == mp_ncpus) {
   935                 /* enable IPIs, TLB shootdown, freezes, etc. */
  936                 atomic_store_rel_int(&smp_started, 1);
  937         }
  938 
  939 #ifdef __amd64__
  940         /*
  941          * Enable global pages TLB extension
  942          * This also implicitly flushes the TLB 
  943          */
  944         load_cr4(rcr4() | CR4_PGE);
  945         if (pmap_pcid_enabled)
  946                 load_cr4(rcr4() | CR4_PCIDE);
  947         load_ds(_udatasel);
  948         load_es(_udatasel);
  949         load_fs(_ufssel);
  950 #endif
  951 
  952         mtx_unlock_spin(&ap_boot_mtx);
  953 
   954         /* Wait until all the APs are up. */
  955         while (atomic_load_acq_int(&smp_started) == 0)
  956                 ia32_pause();
  957 
  958 #ifndef EARLY_AP_STARTUP
  959         /* Start per-CPU event timers. */
  960         cpu_initclocks_ap();
  961 #endif
  962 
  963         sched_throw(NULL);
  964 
  965         panic("scheduler returned us to %s", __func__);
  966         /* NOTREACHED */
  967 }
  968 
  969 /*******************************************************************
  970  * local functions and data
  971  */
  972 
  973 /*
  974  * We tell the I/O APIC code about all the CPUs we want to receive
  975  * interrupts.  If we don't want certain CPUs to receive IRQs we
  976  * can simply not tell the I/O APIC code about them in this function.
  977  * We also do not tell it about the BSP since it tells itself about
  978  * the BSP internally to work with UP kernels and on UP machines.
  979  */
  980 void
  981 set_interrupt_apic_ids(void)
  982 {
  983         u_int i, apic_id;
  984 
  985         for (i = 0; i < MAXCPU; i++) {
  986                 apic_id = cpu_apic_ids[i];
  987                 if (apic_id == -1)
  988                         continue;
  989                 if (cpu_info[apic_id].cpu_bsp)
  990                         continue;
  991                 if (cpu_info[apic_id].cpu_disabled)
  992                         continue;
  993 
  994                 /* Don't let hyperthreads service interrupts. */
  995                 if (cpu_info[apic_id].cpu_hyperthread)
  996                         continue;
  997 
  998                 intr_add_cpu(i);
  999         }
 1000 }
 1001 
 1002 
 1003 #ifdef COUNT_XINVLTLB_HITS
 1004 u_int xhits_gbl[MAXCPU];
 1005 u_int xhits_pg[MAXCPU];
 1006 u_int xhits_rng[MAXCPU];
 1007 static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
 1008 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
 1009     sizeof(xhits_gbl), "IU", "");
 1010 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
 1011     sizeof(xhits_pg), "IU", "");
 1012 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
 1013     sizeof(xhits_rng), "IU", "");
 1014 
 1015 u_int ipi_global;
 1016 u_int ipi_page;
 1017 u_int ipi_range;
 1018 u_int ipi_range_size;
 1019 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
 1020 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
 1021 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
 1022 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
 1023     0, "");
 1024 #endif /* COUNT_XINVLTLB_HITS */
 1025 
 1026 /*
 1027  * Init and startup IPI.
 1028  */
 1029 void
 1030 ipi_startup(int apic_id, int vector)
 1031 {
 1032 
 1033         /*
 1034          * This attempts to follow the algorithm described in the
 1035          * Intel Multiprocessor Specification v1.4 in section B.4.
 1036          * For each IPI, we allow the local APIC ~20us to deliver the
 1037          * IPI.  If that times out, we panic.
 1038          */
 1039 
 1040         /*
 1041          * first we do an INIT IPI: this INIT IPI might be run, resetting
 1042          * and running the target CPU. OR this INIT IPI might be latched (P5
  1043          * bug), leaving the CPU waiting for a STARTUP IPI. OR this INIT IPI
  1044          * might be ignored.
 1045          */
 1046         lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
 1047             APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
 1048         lapic_ipi_wait(100);
 1049 
 1050         /* Explicitly deassert the INIT IPI. */
 1051         lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
 1052             APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
 1053             apic_id);
 1054 
 1055         DELAY(10000);           /* wait ~10mS */
 1056 
 1057         /*
 1058          * next we do a STARTUP IPI: the previous INIT IPI might still be
  1059          * latched (P5 bug); this 1st STARTUP would then terminate
  1060          * immediately, and the previously started INIT IPI would continue. OR
  1061          * the previous INIT IPI has already run, and this STARTUP IPI will
  1062          * run. OR the previous INIT IPI was ignored, and this STARTUP IPI
 1063          * will run.
 1064          */
 1065         lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
 1066             APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
 1067             vector, apic_id);
 1068         if (!lapic_ipi_wait(100))
 1069                 panic("Failed to deliver first STARTUP IPI to APIC %d",
 1070                     apic_id);
 1071         DELAY(200);             /* wait ~200uS */
 1072 
 1073         /*
 1074          * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
 1075          * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
 1076          * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
 1077          * recognized after hardware RESET or INIT IPI.
 1078          */
 1079         lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
 1080             APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
 1081             vector, apic_id);
 1082         if (!lapic_ipi_wait(100))
 1083                 panic("Failed to deliver second STARTUP IPI to APIC %d",
 1084                     apic_id);
 1085 
 1086         DELAY(200);             /* wait ~200uS */
 1087 }
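
The vector argument encodes where the AP begins executing: a STARTUP IPI drops the target into real mode at physical address vector << 12, which must lie in the first megabyte. A one-line sketch with an assumed trampoline address (0x9000 here; the real address is chosen elsewhere in the port-specific MP startup code, not in this file):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint32_t boot_address = 0x9000;         /* assumed trampoline page */
        int vector = boot_address >> 12;        /* the 'vector' argument */

        /* Prints "SIPI vector 0x9 -> AP entry at 0x9000". */
        printf("SIPI vector %#x -> AP entry at %#x\n", vector, vector << 12);
        return (0);
}
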
 1088 
 1089 /*
 1090  * Send an IPI to specified CPU handling the bitmap logic.
 1091  */
 1092 void
 1093 ipi_send_cpu(int cpu, u_int ipi)
 1094 {
 1095         u_int bitmap, old_pending, new_pending;
 1096 
 1097         KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));
 1098 
 1099         if (IPI_IS_BITMAPED(ipi)) {
 1100                 bitmap = 1 << ipi;
 1101                 ipi = IPI_BITMAP_VECTOR;
 1102                 do {
 1103                         old_pending = cpu_ipi_pending[cpu];
 1104                         new_pending = old_pending | bitmap;
 1105                 } while  (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
 1106                     old_pending, new_pending)); 
 1107                 if (old_pending)
 1108                         return;
 1109         }
 1110         lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
 1111 }
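
The compare-and-swap loop above coalesces bitmapped IPIs: only the sender that moves the per-CPU pending word away from zero actually raises IPI_BITMAP_VECTOR; concurrent senders merely OR their bit in and return. A stripped-down userland model of that pattern using C11 atomics instead of the kernel's atomic_cmpset_int (all names here are invented):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic unsigned int pending;    /* models cpu_ipi_pending[cpu] */

/* Returns true when the caller must actually send the hardware IPI. */
static bool
post_bitmapped_ipi(unsigned int bit)
{
        unsigned int oldval, newval;

        do {
                oldval = atomic_load(&pending);
                newval = oldval | (1u << bit);
        } while (!atomic_compare_exchange_weak(&pending, &oldval, newval));
        return (oldval == 0);
}

int
main(void)
{
        /* Prints "1 0": the first post sends the vector, the second piggybacks. */
        printf("%d %d\n", post_bitmapped_ipi(0), post_bitmapped_ipi(1));
        return (0);
}
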
 1112 
 1113 void
 1114 ipi_bitmap_handler(struct trapframe frame)
 1115 {
 1116         struct trapframe *oldframe;
 1117         struct thread *td;
 1118         int cpu = PCPU_GET(cpuid);
 1119         u_int ipi_bitmap;
 1120 
 1121         critical_enter();
 1122         td = curthread;
 1123         td->td_intr_nesting_level++;
 1124         oldframe = td->td_intr_frame;
 1125         td->td_intr_frame = &frame;
 1126         ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
 1127         if (ipi_bitmap & (1 << IPI_PREEMPT)) {
 1128 #ifdef COUNT_IPIS
 1129                 (*ipi_preempt_counts[cpu])++;
 1130 #endif
 1131                 sched_preempt(td);
 1132         }
 1133         if (ipi_bitmap & (1 << IPI_AST)) {
 1134 #ifdef COUNT_IPIS
 1135                 (*ipi_ast_counts[cpu])++;
 1136 #endif
 1137                 /* Nothing to do for AST */
 1138         }
 1139         if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
 1140 #ifdef COUNT_IPIS
 1141                 (*ipi_hardclock_counts[cpu])++;
 1142 #endif
 1143                 hardclockintr();
 1144         }
 1145         td->td_intr_frame = oldframe;
 1146         td->td_intr_nesting_level--;
 1147         critical_exit();
 1148 }
 1149 
 1150 /*
 1151  * send an IPI to a set of cpus.
 1152  */
 1153 void
 1154 ipi_selected(cpuset_t cpus, u_int ipi)
 1155 {
 1156         int cpu;
 1157 
 1158         /*
  1159          * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
  1160          * of help in order to understand what the source is.
 1161          * Set the mask of receiving CPUs for this purpose.
 1162          */
 1163         if (ipi == IPI_STOP_HARD)
 1164                 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
 1165 
 1166         while ((cpu = CPU_FFS(&cpus)) != 0) {
 1167                 cpu--;
 1168                 CPU_CLR(cpu, &cpus);
 1169                 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
 1170                 ipi_send_cpu(cpu, ipi);
 1171         }
 1172 }
 1173 
 1174 /*
 1175  * send an IPI to a specific CPU.
 1176  */
 1177 void
 1178 ipi_cpu(int cpu, u_int ipi)
 1179 {
 1180 
 1181         /*
  1182          * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
  1183          * of help in order to understand what the source is.
 1184          * Set the mask of receiving CPUs for this purpose.
 1185          */
 1186         if (ipi == IPI_STOP_HARD)
 1187                 CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
 1188 
 1189         CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
 1190         ipi_send_cpu(cpu, ipi);
 1191 }
 1192 
 1193 /*
 1194  * send an IPI to all CPUs EXCEPT myself
 1195  */
 1196 void
 1197 ipi_all_but_self(u_int ipi)
 1198 {
 1199         cpuset_t other_cpus;
 1200 
 1201         other_cpus = all_cpus;
 1202         CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 1203         if (IPI_IS_BITMAPED(ipi)) {
 1204                 ipi_selected(other_cpus, ipi);
 1205                 return;
 1206         }
 1207 
 1208         /*
  1209          * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
  1210          * of help in order to understand what the source is.
 1211          * Set the mask of receiving CPUs for this purpose.
 1212          */
 1213         if (ipi == IPI_STOP_HARD)
 1214                 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
 1215 
 1216         CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
 1217         lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
 1218 }
 1219 
 1220 int
 1221 ipi_nmi_handler(void)
 1222 {
 1223         u_int cpuid;
 1224 
 1225         /*
  1226          * As long as there is not a simple way to know about an NMI's
 1227          * source, if the bitmask for the current CPU is present in
  1228          * the global pending bitword, an IPI_STOP_HARD has been issued
 1229          * and should be handled.
 1230          */
 1231         cpuid = PCPU_GET(cpuid);
 1232         if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
 1233                 return (1);
 1234 
 1235         CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
 1236         cpustop_handler();
 1237         return (0);
 1238 }
 1239 
 1240 int nmi_kdb_lock;
 1241 
 1242 void
 1243 nmi_call_kdb_smp(u_int type, struct trapframe *frame)
 1244 {
 1245         int cpu;
 1246         bool call_post;
 1247 
 1248         cpu = PCPU_GET(cpuid);
 1249         if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
 1250                 nmi_call_kdb(cpu, type, frame);
 1251                 call_post = false;
 1252         } else {
 1253                 savectx(&stoppcbs[cpu]);
 1254                 CPU_SET_ATOMIC(cpu, &stopped_cpus);
 1255                 while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
 1256                         ia32_pause();
 1257                 call_post = true;
 1258         }
 1259         atomic_store_rel_int(&nmi_kdb_lock, 0);
 1260         if (call_post)
 1261                 cpustop_handler_post(cpu);
 1262 }
 1263 
 1264 /*
 1265  * Handle an IPI_STOP by saving our current context and spinning until we
 1266  * are resumed.
 1267  */
 1268 void
 1269 cpustop_handler(void)
 1270 {
 1271         u_int cpu;
 1272 
 1273         cpu = PCPU_GET(cpuid);
 1274 
 1275         savectx(&stoppcbs[cpu]);
 1276 
 1277         /* Indicate that we are stopped */
 1278         CPU_SET_ATOMIC(cpu, &stopped_cpus);
 1279 
 1280         /* Wait for restart */
 1281         while (!CPU_ISSET(cpu, &started_cpus))
 1282             ia32_pause();
 1283 
 1284         cpustop_handler_post(cpu);
 1285 }
 1286 
 1287 static void
 1288 cpustop_handler_post(u_int cpu)
 1289 {
 1290 
 1291         CPU_CLR_ATOMIC(cpu, &started_cpus);
 1292         CPU_CLR_ATOMIC(cpu, &stopped_cpus);
 1293 
 1294 #if defined(__amd64__) && defined(DDB)
 1295         amd64_db_resume_dbreg();
 1296 #endif
 1297 
 1298         if (cpu == 0 && cpustop_restartfunc != NULL) {
 1299                 cpustop_restartfunc();
 1300                 cpustop_restartfunc = NULL;
 1301         }
 1302 }
 1303 
 1304 /*
 1305  * Handle an IPI_SUSPEND by saving our current context and spinning until we
 1306  * are resumed.
 1307  */
 1308 void
 1309 cpususpend_handler(void)
 1310 {
 1311         u_int cpu;
 1312 
 1313         mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
 1314 
 1315         cpu = PCPU_GET(cpuid);
 1316         if (savectx(&susppcbs[cpu]->sp_pcb)) {
 1317 #ifdef __amd64__
 1318                 fpususpend(susppcbs[cpu]->sp_fpususpend);
 1319 #else
 1320                 npxsuspend(susppcbs[cpu]->sp_fpususpend);
 1321 #endif
 1322                 /*
 1323                  * suspended_cpus is cleared shortly after each AP is restarted
 1324                  * by a Startup IPI, so that the BSP can proceed to restarting
 1325                  * the next AP.
 1326                  *
 1327                  * resuming_cpus gets cleared when the AP completes
 1328                  * initialization after having been released by the BSP.
 1329                  * resuming_cpus is probably not the best name for the
 1330                  * variable, because it is actually a set of processors that
 1331                  * haven't resumed yet and haven't necessarily started resuming.
 1332                  *
 1333                  * Note that suspended_cpus is meaningful only for ACPI suspend
 1334                  * as it's not really used for Xen suspend since the APs are
 1335                  * automatically restored to the running state and the correct
 1336                  * context.  For the same reason resumectx is never called in
 1337                  * that case.
 1338                  */
 1339                 CPU_SET_ATOMIC(cpu, &suspended_cpus);
 1340                 CPU_SET_ATOMIC(cpu, &resuming_cpus);
 1341 
 1342                 /*
 1343                  * Invalidate the cache after setting the global status bits.
 1344                  * The last AP to set its bit may end up being an Owner of the
 1345                  * corresponding cache line in MOESI protocol.  The AP may be
 1346                  * stopped before the cache line is written to the main memory.
 1347                  */
 1348                 wbinvd();
 1349         } else {
 1350 #ifdef __amd64__
 1351                 fpuresume(susppcbs[cpu]->sp_fpususpend);
 1352 #else
 1353                 npxresume(susppcbs[cpu]->sp_fpususpend);
 1354 #endif
 1355                 pmap_init_pat();
 1356                 initializecpu();
 1357                 PCPU_SET(switchtime, 0);
 1358                 PCPU_SET(switchticks, ticks);
 1359 
 1360                 /* Indicate that we have restarted and restored the context. */
 1361                 CPU_CLR_ATOMIC(cpu, &suspended_cpus);
 1362         }
 1363 
 1364         /* Wait for resume directive */
 1365         while (!CPU_ISSET(cpu, &toresume_cpus))
 1366                 ia32_pause();
 1367 
 1368         /* Re-apply microcode updates. */
 1369         ucode_reload();
 1370 
 1371         if (cpu_ops.cpu_resume)
 1372                 cpu_ops.cpu_resume();
 1373 #ifdef __amd64__
 1374         if (vmm_resume_p)
 1375                 vmm_resume_p();
 1376 #endif
 1377 
 1378         /* Resume MCA and local APIC */
 1379         lapic_xapic_mode();
 1380         mca_resume();
 1381         lapic_setup(0);
 1382 
 1383         /* Indicate that we are resumed */
 1384         CPU_CLR_ATOMIC(cpu, &resuming_cpus);
 1385         CPU_CLR_ATOMIC(cpu, &suspended_cpus);
 1386         CPU_CLR_ATOMIC(cpu, &toresume_cpus);
 1387 }
 1388 
 1389 
 1390 void
 1391 invlcache_handler(void)
 1392 {
 1393         uint32_t generation;
 1394 
 1395 #ifdef COUNT_IPIS
 1396         (*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
 1397 #endif /* COUNT_IPIS */
 1398 
 1399         /*
 1400          * Reading the generation here allows greater parallelism
 1401          * since wbinvd is a serializing instruction.  Without the
 1402          * temporary, we'd wait for wbinvd to complete, then the read
 1403          * would execute, then the dependent write, which must then
 1404          * complete before return from interrupt.
 1405          */
 1406         generation = smp_tlb_generation;
 1407         wbinvd();
 1408         PCPU_SET(smp_tlb_done, generation);
 1409 }
 1410 
 1411 /*
 1412  * This is called once the rest of the system is up and running and we're
  1413  * ready to let the APs out of the pen.
 1414  */
 1415 static void
 1416 release_aps(void *dummy __unused)
 1417 {
 1418 
 1419         if (mp_ncpus == 1) 
 1420                 return;
 1421         atomic_store_rel_int(&aps_ready, 1);
 1422         while (smp_started == 0)
 1423                 ia32_pause();
 1424 }
 1425 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
 1426 
 1427 #ifdef COUNT_IPIS
 1428 /*
 1429  * Setup interrupt counters for IPI handlers.
 1430  */
 1431 static void
 1432 mp_ipi_intrcnt(void *dummy)
 1433 {
 1434         char buf[64];
 1435         int i;
 1436 
 1437         CPU_FOREACH(i) {
 1438                 snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
 1439                 intrcnt_add(buf, &ipi_invltlb_counts[i]);
 1440                 snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
 1441                 intrcnt_add(buf, &ipi_invlrng_counts[i]);
 1442                 snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
 1443                 intrcnt_add(buf, &ipi_invlpg_counts[i]);
 1444                 snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
 1445                 intrcnt_add(buf, &ipi_invlcache_counts[i]);
 1446                 snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
 1447                 intrcnt_add(buf, &ipi_preempt_counts[i]);
 1448                 snprintf(buf, sizeof(buf), "cpu%d:ast", i);
 1449                 intrcnt_add(buf, &ipi_ast_counts[i]);
 1450                 snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
 1451                 intrcnt_add(buf, &ipi_rendezvous_counts[i]);
 1452                 snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
 1453                 intrcnt_add(buf, &ipi_hardclock_counts[i]);
 1454         }               
 1455 }
 1456 SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
 1457 #endif
 1458 
 1459 /*
  1460  * Flush the TLB on other CPUs
 1461  */
 1462 
 1463 /* Variables needed for SMP tlb shootdown. */
 1464 vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
 1465 pmap_t smp_tlb_pmap;
 1466 volatile uint32_t smp_tlb_generation;
 1467 
 1468 #ifdef __amd64__
 1469 #define read_eflags() read_rflags()
 1470 #endif
 1471 
 1472 static void
 1473 smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
 1474     vm_offset_t addr1, vm_offset_t addr2)
 1475 {
 1476         cpuset_t other_cpus;
 1477         volatile uint32_t *p_cpudone;
 1478         uint32_t generation;
 1479         int cpu;
 1480 
 1481         /*
 1482          * Check for other cpus.  Return if none.
 1483          */
 1484         if (CPU_ISFULLSET(&mask)) {
 1485                 if (mp_ncpus <= 1)
 1486                         return;
 1487         } else {
 1488                 CPU_CLR(PCPU_GET(cpuid), &mask);
 1489                 if (CPU_EMPTY(&mask))
 1490                         return;
 1491         }
 1492 
 1493         if (!(read_eflags() & PSL_I))
 1494                 panic("%s: interrupts disabled", __func__);
 1495         mtx_lock_spin(&smp_ipi_mtx);
 1496         smp_tlb_addr1 = addr1;
 1497         smp_tlb_addr2 = addr2;
 1498         smp_tlb_pmap = pmap;
 1499         generation = ++smp_tlb_generation;
 1500         if (CPU_ISFULLSET(&mask)) {
 1501                 ipi_all_but_self(vector);
 1502                 other_cpus = all_cpus;
 1503                 CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 1504         } else {
 1505                 other_cpus = mask;
 1506                 while ((cpu = CPU_FFS(&mask)) != 0) {
 1507                         cpu--;
 1508                         CPU_CLR(cpu, &mask);
 1509                         CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
 1510                             cpu, vector);
 1511                         ipi_send_cpu(cpu, vector);
 1512                 }
 1513         }
 1514         while ((cpu = CPU_FFS(&other_cpus)) != 0) {
 1515                 cpu--;
 1516                 CPU_CLR(cpu, &other_cpus);
 1517                 p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
 1518                 while (*p_cpudone != generation)
 1519                         ia32_pause();
 1520         }
 1521         mtx_unlock_spin(&smp_ipi_mtx);
 1522 }
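
The completion handshake is a generation count: the initiator bumps smp_tlb_generation under smp_ipi_mtx, sends the vector, and then spins until every targeted CPU has copied that generation into its per-CPU smp_tlb_done (see the handlers further down). A stripped-down userland model of the handshake with POSIX threads and C11 atomics, every name invented and with no pmap or APIC involved:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NCPUS 4

static _Atomic uint32_t generation;             /* models smp_tlb_generation */
static _Atomic uint32_t done[NCPUS];            /* models per-CPU smp_tlb_done */

/* Each "AP" waits for a request, flushes locally, then publishes the gen. */
static void *
shootdown_handler(void *arg)
{
        int cpu = (int)(intptr_t)arg;
        uint32_t gen;

        while ((gen = atomic_load(&generation)) == 0)
                ;                               /* stand-in for the IPI */
        /* a local TLB flush would go here */
        atomic_store(&done[cpu], gen);
        return (NULL);
}

int
main(void)
{
        pthread_t tid[NCPUS];

        for (int i = 0; i < NCPUS; i++)
                pthread_create(&tid[i], NULL, shootdown_handler,
                    (void *)(intptr_t)i);

        uint32_t gen = atomic_fetch_add(&generation, 1) + 1;   /* ++generation */
        for (int i = 0; i < NCPUS; i++)         /* the initiator's wait loop */
                while (atomic_load(&done[i]) != gen)
                        ;
        printf("all %d CPUs acknowledged generation %u\n", NCPUS, (unsigned)gen);

        for (int i = 0; i < NCPUS; i++)
                pthread_join(tid[i], NULL);
        return (0);
}
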
 1523 
 1524 void
 1525 smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
 1526 {
 1527 
 1528         if (smp_started) {
 1529                 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
 1530 #ifdef COUNT_XINVLTLB_HITS
 1531                 ipi_global++;
 1532 #endif
 1533         }
 1534 }
 1535 
 1536 void
 1537 smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap)
 1538 {
 1539 
 1540         if (smp_started) {
 1541                 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
 1542 #ifdef COUNT_XINVLTLB_HITS
 1543                 ipi_page++;
 1544 #endif
 1545         }
 1546 }
 1547 
 1548 void
 1549 smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
 1550     pmap_t pmap)
 1551 {
 1552 
 1553         if (smp_started) {
 1554                 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap,
 1555                     addr1, addr2);
 1556 #ifdef COUNT_XINVLTLB_HITS
 1557                 ipi_range++;
 1558                 ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
 1559 #endif
 1560         }
 1561 }
 1562 
 1563 void
 1564 smp_cache_flush(void)
 1565 {
 1566 
 1567         if (smp_started) {
 1568                 smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL,
 1569                     0, 0);
 1570         }
 1571 }
 1572 
 1573 /*
 1574  * Handlers for TLB related IPIs
 1575  */
 1576 void
 1577 invltlb_handler(void)
 1578 {
 1579         uint32_t generation;
 1580   
 1581 #ifdef COUNT_XINVLTLB_HITS
 1582         xhits_gbl[PCPU_GET(cpuid)]++;
 1583 #endif /* COUNT_XINVLTLB_HITS */
 1584 #ifdef COUNT_IPIS
 1585         (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 1586 #endif /* COUNT_IPIS */
 1587 
 1588         /*
 1589          * Reading the generation here allows greater parallelism
 1590          * since invalidating the TLB is a serializing operation.
 1591          */
 1592         generation = smp_tlb_generation;
 1593         if (smp_tlb_pmap == kernel_pmap)
 1594                 invltlb_glob();
 1595         else
 1596                 invltlb();
 1597         PCPU_SET(smp_tlb_done, generation);
 1598 }
 1599 
 1600 void
 1601 invlpg_handler(void)
 1602 {
 1603         uint32_t generation;
 1604 
 1605 #ifdef COUNT_XINVLTLB_HITS
 1606         xhits_pg[PCPU_GET(cpuid)]++;
 1607 #endif /* COUNT_XINVLTLB_HITS */
 1608 #ifdef COUNT_IPIS
 1609         (*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 1610 #endif /* COUNT_IPIS */
 1611 
 1612         generation = smp_tlb_generation;        /* Overlap with serialization */
 1613         invlpg(smp_tlb_addr1);
 1614         PCPU_SET(smp_tlb_done, generation);
 1615 }
 1616 
 1617 void
 1618 invlrng_handler(void)
 1619 {
 1620         vm_offset_t addr, addr2;
 1621         uint32_t generation;
 1622 
 1623 #ifdef COUNT_XINVLTLB_HITS
 1624         xhits_rng[PCPU_GET(cpuid)]++;
 1625 #endif /* COUNT_XINVLTLB_HITS */
 1626 #ifdef COUNT_IPIS
 1627         (*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
 1628 #endif /* COUNT_IPIS */
 1629 
 1630         addr = smp_tlb_addr1;
 1631         addr2 = smp_tlb_addr2;
 1632         generation = smp_tlb_generation;        /* Overlap with serialization */
 1633         do {
 1634                 invlpg(addr);
 1635                 addr += PAGE_SIZE;
 1636         } while (addr < addr2);
 1637 
 1638         PCPU_SET(smp_tlb_done, generation);
 1639 }

This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.