FreeBSD/Linux Kernel Cross Reference
sys/kern/subr_smp.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
    5  *
    6  * Redistribution and use in source and binary forms, with or without
    7  * modification, are permitted provided that the following conditions
    8  * are met:
    9  * 1. Redistributions of source code must retain the above copyright
   10  *    notice, this list of conditions and the following disclaimer.
   11  * 2. Redistributions in binary form must reproduce the above copyright
   12  *    notice, this list of conditions and the following disclaimer in the
   13  *    documentation and/or other materials provided with the distribution.
   14  *
   15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   25  * SUCH DAMAGE.
   26  */
   27 
   28 /*
   29  * This module holds the global variables and machine independent functions
   30  * used for the kernel SMP support.
   31  */
   32 
   33 #include <sys/cdefs.h>
   34 __FBSDID("$FreeBSD$");
   35 
   36 #include <sys/param.h>
   37 #include <sys/systm.h>
   38 #include <sys/kernel.h>
   39 #include <sys/ktr.h>
   40 #include <sys/proc.h>
   41 #include <sys/bus.h>
   42 #include <sys/lock.h>
   43 #include <sys/malloc.h>
   44 #include <sys/mutex.h>
   45 #include <sys/pcpu.h>
   46 #include <sys/sched.h>
   47 #include <sys/smp.h>
   48 #include <sys/sysctl.h>
   49 
   50 #include <machine/cpu.h>
   51 #include <machine/smp.h>
   52 
   53 #include "opt_sched.h"
   54 
   55 #ifdef SMP
   56 MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");
   57 
   58 volatile cpuset_t stopped_cpus;
   59 volatile cpuset_t started_cpus;
   60 volatile cpuset_t suspended_cpus;
   61 cpuset_t hlt_cpus_mask;
   62 cpuset_t logical_cpus_mask;
   63 
   64 void (*cpustop_restartfunc)(void);
   65 #endif
   66 
   67 static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS);
   68 
   69 /* This is used in modules that need to work in both SMP and UP. */
   70 cpuset_t all_cpus;
   71 
   72 int mp_ncpus;
   73 /* export this for libkvm consumers. */
   74 int mp_maxcpus = MAXCPU;
   75 
   76 volatile int smp_started;
   77 u_int mp_maxid;
   78 
   79 static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL,
   80     "Kernel SMP");
   81 
   82 SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
   83     "Max CPU ID.");
   84 
   85 SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
   86     0, "Max number of CPUs that the system was compiled for.");
   87 
   88 SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD|CTLTYPE_INT|CTLFLAG_MPSAFE,
   89     NULL, 0, sysctl_kern_smp_active, "I",
   90     "Indicates system is running in SMP mode");
   91 
   92 int smp_disabled = 0;   /* has smp been disabled? */
   93 SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
   94     &smp_disabled, 0, "SMP has been disabled from the loader");
   95 
   96 int smp_cpus = 1;       /* how many cpu's running */
   97 SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
   98     "Number of CPUs online");
   99 
  100 int smp_threads_per_core = 1;   /* how many SMT threads are running per core */
  101 SYSCTL_INT(_kern_smp, OID_AUTO, threads_per_core, CTLFLAG_RD|CTLFLAG_CAPRD,
  102     &smp_threads_per_core, 0, "Number of SMT threads online per core");
  103 
  104 int mp_ncores = -1;     /* how many physical cores running */
  105 SYSCTL_INT(_kern_smp, OID_AUTO, cores, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_ncores, 0,
  106     "Number of physical cores online");
  107 
  108 int smp_topology = 0;   /* Which topology we're using. */
  109 SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0,
  110     "Topology override setting; 0 is default provided by hardware.");
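/*
 * Non-zero values select one of the fake topologies enumerated in smp_topo()
 * below; for example, setting the kern.smp.topology loader tunable to 3
 * requests a dual-core, shared-L2 topology regardless of what the hardware
 * reports.
 */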
  111 
  112 #ifdef SMP
  113 /* Enable forwarding of a signal to a process running on a different CPU */
  114 static int forward_signal_enabled = 1;
  115 SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
  116            &forward_signal_enabled, 0,
  117            "Forwarding of a signal to a process on a different CPU");
  118 
  119 /* Variables needed for SMP rendezvous. */
  120 static volatile int smp_rv_ncpus;
  121 static void (*volatile smp_rv_setup_func)(void *arg);
  122 static void (*volatile smp_rv_action_func)(void *arg);
  123 static void (*volatile smp_rv_teardown_func)(void *arg);
  124 static void *volatile smp_rv_func_arg;
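/*
 * smp_rv_waiters[] counts, in order: CPUs that have entered the handler (and
 * may therefore safely read the parameters above), CPUs finished with the
 * setup function, CPUs finished with the action function, and CPUs that are
 * completely done with the rendezvous, after which the parameters above may
 * be reused by the next caller.
 */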
  125 static volatile int smp_rv_waiters[4];
  126 
  127 /* 
  128  * Shared mutex to restrict busywaits between smp_rendezvous() and
  129  * smp(_targeted)_tlb_shootdown().  A deadlock occurs if both of these
  130  * functions trigger at once and cause multiple CPUs to busywait with
  131  * interrupts disabled. 
  132  */
  133 struct mtx smp_ipi_mtx;
  134 
  135 /*
  136  * Let the MD SMP code initialize mp_maxid very early if it can.
  137  */
  138 static void
  139 mp_setmaxid(void *dummy)
  140 {
  141 
  142         cpu_mp_setmaxid();
  143 
  144         KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__));
  145         KASSERT(mp_ncpus > 1 || mp_maxid == 0,
  146             ("%s: one CPU but mp_maxid is not zero", __func__));
  147         KASSERT(mp_maxid >= mp_ncpus - 1,
  148             ("%s: counters out of sync: max %d, count %d", __func__,
  149                 mp_maxid, mp_ncpus));
  150 }
  151 SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);
  152 
  153 /*
  154  * Call the MD SMP initialization code.
  155  */
  156 static void
  157 mp_start(void *dummy)
  158 {
  159 
  160         mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);
  161 
  162         /* Probe for MP hardware. */
  163         if (smp_disabled != 0 || cpu_mp_probe() == 0) {
  164                 mp_ncores = 1;
  165                 mp_ncpus = 1;
  166                 CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
  167                 return;
  168         }
  169 
  170         cpu_mp_start();
  171         printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
  172             mp_ncpus);
  173 
  174         /* Provide a default for most architectures that don't have SMT/HTT. */
  175         if (mp_ncores < 0)
  176                 mp_ncores = mp_ncpus;
  177 
  178         cpu_mp_announce();
  179 }
  180 SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);
  181 
  182 void
  183 forward_signal(struct thread *td)
  184 {
  185         int id;
  186 
  187         /*
  188          * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on
  189          * this thread, so all we need to do is poke it if it is currently
  190          * executing so that it executes ast().
  191          */
  192         THREAD_LOCK_ASSERT(td, MA_OWNED);
  193         KASSERT(TD_IS_RUNNING(td),
  194             ("forward_signal: thread is not TDS_RUNNING"));
  195 
  196         CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);
  197 
  198         if (!smp_started || cold || panicstr)
  199                 return;
  200         if (!forward_signal_enabled)
  201                 return;
  202 
  203         /* No need to IPI ourself. */
  204         if (td == curthread)
  205                 return;
  206 
  207         id = td->td_oncpu;
  208         if (id == NOCPU)
  209                 return;
  210         ipi_cpu(id, IPI_AST);
  211 }
  212 
  213 /*
   214  * When called, the executing CPU sends an IPI to the CPUs in the map,
   215  * requesting that they halt execution.
  216  *
  217  * Usually (but not necessarily) called with 'other_cpus' as its arg.
  218  *
  219  *  - Signals all CPUs in map to stop.
  220  *  - Waits for each to stop.
  221  *
  222  * Returns:
  223  *  -1: error
   224  *   0: NA (SMP not started; nothing was done)
  225  *   1: ok
  226  *
  227  */
  228 #if defined(__amd64__) || defined(__i386__)
  229 #define X86     1
  230 #else
  231 #define X86     0
  232 #endif
  233 static int
  234 generic_stop_cpus(cpuset_t map, u_int type)
  235 {
  236 #ifdef KTR
  237         char cpusetbuf[CPUSETBUFSIZ];
  238 #endif
  239         static volatile u_int stopping_cpu = NOCPU;
  240         int i;
  241         volatile cpuset_t *cpus;
  242 
  243         KASSERT(
  244             type == IPI_STOP || type == IPI_STOP_HARD
  245 #if X86
  246             || type == IPI_SUSPEND
  247 #endif
  248             , ("%s: invalid stop type", __func__));
  249 
  250         if (!smp_started)
  251                 return (0);
  252 
  253         CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
  254             cpusetobj_strprint(cpusetbuf, &map), type);
  255 
  256 #if X86
  257         /*
   258  * When suspending, ensure there are no IPIs in progress.
  259          * IPIs that have been issued, but not yet delivered (e.g.
  260          * not pending on a vCPU when running under virtualization)
  261          * will be lost, violating FreeBSD's assumption of reliable
  262          * IPI delivery.
  263          */
  264         if (type == IPI_SUSPEND)
  265                 mtx_lock_spin(&smp_ipi_mtx);
  266 #endif
  267 
  268 #if X86
  269         if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
  270 #endif
  271         if (stopping_cpu != PCPU_GET(cpuid))
  272                 while (atomic_cmpset_int(&stopping_cpu, NOCPU,
  273                     PCPU_GET(cpuid)) == 0)
  274                         while (stopping_cpu != NOCPU)
  275                                 cpu_spinwait(); /* spin */
  276 
  277         /* send the stop IPI to all CPUs in map */
  278         ipi_selected(map, type);
  279 #if X86
  280         }
  281 #endif
  282 
  283 #if X86
  284         if (type == IPI_SUSPEND)
  285                 cpus = &suspended_cpus;
  286         else
  287 #endif
  288                 cpus = &stopped_cpus;
  289 
  290         i = 0;
  291         while (!CPU_SUBSET(cpus, &map)) {
  292                 /* spin */
  293                 cpu_spinwait();
  294                 i++;
  295                 if (i == 100000000) {
  296                         printf("timeout stopping cpus\n");
  297                         break;
  298                 }
  299         }
  300 
  301 #if X86
  302         if (type == IPI_SUSPEND)
  303                 mtx_unlock_spin(&smp_ipi_mtx);
  304 #endif
  305 
  306         stopping_cpu = NOCPU;
  307         return (1);
  308 }
  309 
  310 int
  311 stop_cpus(cpuset_t map)
  312 {
  313 
  314         return (generic_stop_cpus(map, IPI_STOP));
  315 }
  316 
  317 int
  318 stop_cpus_hard(cpuset_t map)
  319 {
  320 
  321         return (generic_stop_cpus(map, IPI_STOP_HARD));
  322 }
  323 
  324 #if X86
  325 int
  326 suspend_cpus(cpuset_t map)
  327 {
  328 
  329         return (generic_stop_cpus(map, IPI_SUSPEND));
  330 }
  331 #endif
  332 
  333 /*
  334  * Called by a CPU to restart stopped CPUs. 
  335  *
  336  * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
  337  *
  338  *  - Signals all CPUs in map to restart.
  339  *  - Waits for each to restart.
  340  *
  341  * Returns:
  342  *  -1: error
   343  *   0: NA (SMP not started; nothing was done)
  344  *   1: ok
  345  */
  346 static int
  347 generic_restart_cpus(cpuset_t map, u_int type)
  348 {
  349 #ifdef KTR
  350         char cpusetbuf[CPUSETBUFSIZ];
  351 #endif
  352         volatile cpuset_t *cpus;
  353 
  354         KASSERT(type == IPI_STOP || type == IPI_STOP_HARD
  355 #if X86
  356             || type == IPI_SUSPEND
  357 #endif
  358             , ("%s: invalid stop type", __func__));
  359 
  360         if (!smp_started)
  361                 return (0);
  362 
  363         CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));
  364 
  365 #if X86
  366         if (type == IPI_SUSPEND)
  367                 cpus = &resuming_cpus;
  368         else
  369 #endif
  370                 cpus = &stopped_cpus;
  371 
  372         /* signal other cpus to restart */
  373 #if X86
  374         if (type == IPI_SUSPEND)
  375                 CPU_COPY_STORE_REL(&map, &toresume_cpus);
  376         else
  377 #endif
  378                 CPU_COPY_STORE_REL(&map, &started_cpus);
  379 
  380 #if X86
  381         if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
  382 #endif
  383         /* wait for each to clear its bit */
  384         while (CPU_OVERLAP(cpus, &map))
  385                 cpu_spinwait();
  386 #if X86
  387         }
  388 #endif
  389 
  390         return (1);
  391 }
  392 
  393 int
  394 restart_cpus(cpuset_t map)
  395 {
  396 
  397         return (generic_restart_cpus(map, IPI_STOP));
  398 }
  399 
  400 #if X86
  401 int
  402 resume_cpus(cpuset_t map)
  403 {
  404 
  405         return (generic_restart_cpus(map, IPI_SUSPEND));
  406 }
  407 #endif
  408 #undef X86
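
/*
 * Illustrative sketch (not part of the original file): the typical pairing of
 * stop_cpus() and restart_cpus(), as used by debugger-style callers.  The
 * function name example_stop_and_inspect() is a placeholder.
 */
static void __unused
example_stop_and_inspect(void)
{
        cpuset_t other_cpus;

        /* Stop every CPU except the one we are running on. */
        other_cpus = all_cpus;
        CPU_CLR(PCPU_GET(cpuid), &other_cpus);
        if (stop_cpus(other_cpus) == 0)
                return;         /* SMP not started; nothing to stop. */

        /*
         * The other CPUs are now spinning in their stop handlers, so global
         * state can be examined or changed without interference.
         */

        /* Resume the CPUs that were stopped above. */
        restart_cpus(other_cpus);
}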
  409 
  410 /*
  411  * All-CPU rendezvous.  CPUs are signalled, all execute the setup function 
  412  * (if specified), rendezvous, execute the action function (if specified),
  413  * rendezvous again, execute the teardown function (if specified), and then
  414  * resume.
  415  *
  416  * Note that the supplied external functions _must_ be reentrant and aware
  417  * that they are running in parallel and in an unknown lock context.
  418  */
  419 void
  420 smp_rendezvous_action(void)
  421 {
  422         struct thread *td;
  423         void *local_func_arg;
  424         void (*local_setup_func)(void*);
  425         void (*local_action_func)(void*);
  426         void (*local_teardown_func)(void*);
  427 #ifdef INVARIANTS
  428         int owepreempt;
  429 #endif
  430 
  431         /* Ensure we have up-to-date values. */
  432         atomic_add_acq_int(&smp_rv_waiters[0], 1);
  433         while (smp_rv_waiters[0] < smp_rv_ncpus)
  434                 cpu_spinwait();
  435 
  436         /* Fetch rendezvous parameters after acquire barrier. */
  437         local_func_arg = smp_rv_func_arg;
  438         local_setup_func = smp_rv_setup_func;
  439         local_action_func = smp_rv_action_func;
  440         local_teardown_func = smp_rv_teardown_func;
  441 
  442         /*
  443          * Use a nested critical section to prevent any preemptions
  444          * from occurring during a rendezvous action routine.
  445          * Specifically, if a rendezvous handler is invoked via an IPI
  446          * and the interrupted thread was in the critical_exit()
  447          * function after setting td_critnest to 0 but before
  448          * performing a deferred preemption, this routine can be
  449          * invoked with td_critnest set to 0 and td_owepreempt true.
  450          * In that case, a critical_exit() during the rendezvous
  451          * action would trigger a preemption which is not permitted in
  452          * a rendezvous action.  To fix this, wrap all of the
  453          * rendezvous action handlers in a critical section.  We
  454          * cannot use a regular critical section however as having
  455          * critical_exit() preempt from this routine would also be
  456          * problematic (the preemption must not occur before the IPI
  457          * has been acknowledged via an EOI).  Instead, we
  458          * intentionally ignore td_owepreempt when leaving the
  459          * critical section.  This should be harmless because we do
  460          * not permit rendezvous action routines to schedule threads,
  461          * and thus td_owepreempt should never transition from 0 to 1
  462          * during this routine.
  463          */
  464         td = curthread;
  465         td->td_critnest++;
  466 #ifdef INVARIANTS
  467         owepreempt = td->td_owepreempt;
  468 #endif
  469         
  470         /*
  471          * If requested, run a setup function before the main action
  472          * function.  Ensure all CPUs have completed the setup
  473          * function before moving on to the action function.
  474          */
  475         if (local_setup_func != smp_no_rendezvous_barrier) {
  476                 if (smp_rv_setup_func != NULL)
  477                         smp_rv_setup_func(smp_rv_func_arg);
  478                 atomic_add_int(&smp_rv_waiters[1], 1);
  479                 while (smp_rv_waiters[1] < smp_rv_ncpus)
  480                         cpu_spinwait();
  481         }
  482 
  483         if (local_action_func != NULL)
  484                 local_action_func(local_func_arg);
  485 
  486         if (local_teardown_func != smp_no_rendezvous_barrier) {
  487                 /*
  488                  * Signal that the main action has been completed.  If a
  489                  * full exit rendezvous is requested, then all CPUs will
  490                  * wait here until all CPUs have finished the main action.
  491                  */
  492                 atomic_add_int(&smp_rv_waiters[2], 1);
  493                 while (smp_rv_waiters[2] < smp_rv_ncpus)
  494                         cpu_spinwait();
  495 
  496                 if (local_teardown_func != NULL)
  497                         local_teardown_func(local_func_arg);
  498         }
  499 
  500         /*
  501          * Signal that the rendezvous is fully completed by this CPU.
  502          * This means that no member of smp_rv_* pseudo-structure will be
  503          * accessed by this target CPU after this point; in particular,
  504          * memory pointed by smp_rv_func_arg.
  505          *
  506          * The release semantic ensures that all accesses performed by
  507          * the current CPU are visible when smp_rendezvous_cpus()
  508          * returns, by synchronizing with the
  509          * atomic_load_acq_int(&smp_rv_waiters[3]).
  510          */
  511         atomic_add_rel_int(&smp_rv_waiters[3], 1);
  512 
  513         td->td_critnest--;
  514         KASSERT(owepreempt == td->td_owepreempt,
  515             ("rendezvous action changed td_owepreempt"));
  516 }
  517 
  518 void
  519 smp_rendezvous_cpus(cpuset_t map,
  520         void (* setup_func)(void *), 
  521         void (* action_func)(void *),
  522         void (* teardown_func)(void *),
  523         void *arg)
  524 {
  525         int curcpumap, i, ncpus = 0;
  526 
   527         /* See the comments in the !SMP case. */
  528         if (!smp_started) {
  529                 spinlock_enter();
  530                 if (setup_func != NULL)
  531                         setup_func(arg);
  532                 if (action_func != NULL)
  533                         action_func(arg);
  534                 if (teardown_func != NULL)
  535                         teardown_func(arg);
  536                 spinlock_exit();
  537                 return;
  538         }
  539 
  540         CPU_FOREACH(i) {
  541                 if (CPU_ISSET(i, &map))
  542                         ncpus++;
  543         }
  544         if (ncpus == 0)
  545                 panic("ncpus is 0 with non-zero map");
  546 
  547         mtx_lock_spin(&smp_ipi_mtx);
  548 
  549         /* Pass rendezvous parameters via global variables. */
  550         smp_rv_ncpus = ncpus;
  551         smp_rv_setup_func = setup_func;
  552         smp_rv_action_func = action_func;
  553         smp_rv_teardown_func = teardown_func;
  554         smp_rv_func_arg = arg;
  555         smp_rv_waiters[1] = 0;
  556         smp_rv_waiters[2] = 0;
  557         smp_rv_waiters[3] = 0;
  558         atomic_store_rel_int(&smp_rv_waiters[0], 0);
  559 
  560         /*
  561          * Signal other processors, which will enter the IPI with
  562          * interrupts off.
  563          */
  564         curcpumap = CPU_ISSET(curcpu, &map);
  565         CPU_CLR(curcpu, &map);
  566         ipi_selected(map, IPI_RENDEZVOUS);
  567 
  568         /* Check if the current CPU is in the map */
  569         if (curcpumap != 0)
  570                 smp_rendezvous_action();
  571 
  572         /*
  573          * Ensure that the master CPU waits for all the other
  574          * CPUs to finish the rendezvous, so that smp_rv_*
  575          * pseudo-structure and the arg are guaranteed to not
  576          * be in use.
  577          *
  578          * Load acquire synchronizes with the release add in
  579          * smp_rendezvous_action(), which ensures that our caller sees
  580          * all memory actions done by the called functions on other
  581          * CPUs.
  582          */
  583         while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
  584                 cpu_spinwait();
  585 
  586         mtx_unlock_spin(&smp_ipi_mtx);
  587 }
  588 
  589 void
  590 smp_rendezvous(void (* setup_func)(void *), 
  591                void (* action_func)(void *),
  592                void (* teardown_func)(void *),
  593                void *arg)
  594 {
  595         smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
  596 }
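
/*
 * Illustrative sketch (not part of the original file): counting CPUs with a
 * rendezvous.  No setup or teardown work is needed, so
 * smp_no_rendezvous_barrier is passed to skip those barriers.  The names
 * example_count_action() and example_count_cpus() are placeholders.
 */
static void
example_count_action(void *arg)
{

        /* Runs once on every CPU, in parallel, with interrupts disabled. */
        atomic_add_int(arg, 1);
}

static u_int __unused
example_count_cpus(void)
{
        u_int count;

        count = 0;
        smp_rendezvous(smp_no_rendezvous_barrier, example_count_action,
            smp_no_rendezvous_barrier, &count);

        /* Every CPU has finished with 'count' before smp_rendezvous() returns. */
        return (count);
}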
  597 
  598 static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];
  599 
  600 struct cpu_group *
  601 smp_topo(void)
  602 {
  603         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
  604         struct cpu_group *top;
  605 
  606         /*
  607          * Check for a fake topology request for debugging purposes.
  608          */
  609         switch (smp_topology) {
  610         case 1:
  611                 /* Dual core with no sharing.  */
  612                 top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
  613                 break;
  614         case 2:
  615                 /* No topology, all cpus are equal. */
  616                 top = smp_topo_none();
  617                 break;
  618         case 3:
  619                 /* Dual core with shared L2.  */
  620                 top = smp_topo_1level(CG_SHARE_L2, 2, 0);
  621                 break;
  622         case 4:
  623                 /* quad core, shared l3 among each package, private l2.  */
  624                 top = smp_topo_1level(CG_SHARE_L3, 4, 0);
  625                 break;
  626         case 5:
  627                 /* quad core,  2 dualcore parts on each package share l2.  */
  628                 top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
  629                 break;
  630         case 6:
  631                 /* Single-core 2xHTT */
  632                 top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
  633                 break;
  634         case 7:
  635                 /* quad core with a shared l3, 8 threads sharing L2.  */
  636                 top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
  637                     CG_FLAG_SMT);
  638                 break;
  639         default:
  640                 /* Default, ask the system what it wants. */
  641                 top = cpu_topo();
  642                 break;
  643         }
  644         /*
  645          * Verify the returned topology.
  646          */
  647         if (top->cg_count != mp_ncpus)
  648                 panic("Built bad topology at %p.  CPU count %d != %d",
  649                     top, top->cg_count, mp_ncpus);
  650         if (CPU_CMP(&top->cg_mask, &all_cpus))
  651                 panic("Built bad topology at %p.  CPU mask (%s) != (%s)",
  652                     top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
  653                     cpusetobj_strprint(cpusetbuf2, &all_cpus));
  654 
  655         /*
  656          * Collapse nonsense levels that may be created out of convenience by
  657          * the MD layers.  They cause extra work in the search functions.
  658          */
  659         while (top->cg_children == 1) {
  660                 top = &top->cg_child[0];
  661                 top->cg_parent = NULL;
  662         }
  663         return (top);
  664 }
  665 
  666 struct cpu_group *
  667 smp_topo_alloc(u_int count)
  668 {
  669         static u_int index;
  670         u_int curr;
  671 
  672         curr = index;
  673         index += count;
  674         return (&group[curr]);
  675 }
  676 
  677 struct cpu_group *
  678 smp_topo_none(void)
  679 {
  680         struct cpu_group *top;
  681 
  682         top = &group[0];
  683         top->cg_parent = NULL;
  684         top->cg_child = NULL;
  685         top->cg_mask = all_cpus;
  686         top->cg_count = mp_ncpus;
  687         top->cg_children = 0;
  688         top->cg_level = CG_SHARE_NONE;
  689         top->cg_flags = 0;
  690         
  691         return (top);
  692 }
  693 
  694 static int
  695 smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
  696     int count, int flags, int start)
  697 {
  698         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
  699         cpuset_t mask;
  700         int i;
  701 
  702         CPU_ZERO(&mask);
  703         for (i = 0; i < count; i++, start++)
  704                 CPU_SET(start, &mask);
  705         child->cg_parent = parent;
  706         child->cg_child = NULL;
  707         child->cg_children = 0;
  708         child->cg_level = share;
  709         child->cg_count = count;
  710         child->cg_flags = flags;
  711         child->cg_mask = mask;
  712         parent->cg_children++;
  713         for (; parent != NULL; parent = parent->cg_parent) {
  714                 if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
  715                         panic("Duplicate children in %p.  mask (%s) child (%s)",
  716                             parent,
  717                             cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
  718                             cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
  719                 CPU_OR(&parent->cg_mask, &child->cg_mask);
  720                 parent->cg_count += child->cg_count;
  721         }
  722 
  723         return (start);
  724 }
  725 
  726 struct cpu_group *
  727 smp_topo_1level(int share, int count, int flags)
  728 {
  729         struct cpu_group *child;
  730         struct cpu_group *top;
  731         int packages;
  732         int cpu;
  733         int i;
  734 
  735         cpu = 0;
  736         top = &group[0];
  737         packages = mp_ncpus / count;
  738         top->cg_child = child = &group[1];
  739         top->cg_level = CG_SHARE_NONE;
  740         for (i = 0; i < packages; i++, child++)
  741                 cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
  742         return (top);
  743 }
  744 
  745 struct cpu_group *
  746 smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
  747     int l1flags)
  748 {
  749         struct cpu_group *top;
  750         struct cpu_group *l1g;
  751         struct cpu_group *l2g;
  752         int cpu;
  753         int i;
  754         int j;
  755 
  756         cpu = 0;
  757         top = &group[0];
  758         l2g = &group[1];
  759         top->cg_child = l2g;
  760         top->cg_level = CG_SHARE_NONE;
  761         top->cg_children = mp_ncpus / (l2count * l1count);
  762         l1g = l2g + top->cg_children;
  763         for (i = 0; i < top->cg_children; i++, l2g++) {
  764                 l2g->cg_parent = top;
  765                 l2g->cg_child = l1g;
  766                 l2g->cg_level = l2share;
  767                 for (j = 0; j < l2count; j++, l1g++)
  768                         cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
  769                             l1flags, cpu);
  770         }
  771         return (top);
  772 }
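
/*
 * For example (illustrative, not from the original source), on an 8-CPU
 * system smp_topo_2level(CG_SHARE_L3, 2, CG_SHARE_L1, 2, CG_FLAG_HTT)
 * builds:
 *
 *      root (CG_SHARE_NONE, 8 CPUs)
 *        2 x package group (CG_SHARE_L3, 4 CPUs each)
 *            2 x core group (CG_SHARE_L1, 2 HTT threads each)
 */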
  773 
  774 
  775 struct cpu_group *
  776 smp_topo_find(struct cpu_group *top, int cpu)
  777 {
  778         struct cpu_group *cg;
  779         cpuset_t mask;
  780         int children;
  781         int i;
  782 
  783         CPU_SETOF(cpu, &mask);
  784         cg = top;
  785         for (;;) {
  786                 if (!CPU_OVERLAP(&cg->cg_mask, &mask))
  787                         return (NULL);
  788                 if (cg->cg_children == 0)
  789                         return (cg);
  790                 children = cg->cg_children;
  791                 for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
  792                         if (CPU_OVERLAP(&cg->cg_mask, &mask))
  793                                 break;
  794         }
  795         return (NULL);
  796 }
  797 #else /* !SMP */
  798 
  799 void
  800 smp_rendezvous_cpus(cpuset_t map,
  801         void (*setup_func)(void *), 
  802         void (*action_func)(void *),
  803         void (*teardown_func)(void *),
  804         void *arg)
  805 {
  806         /*
  807          * In the !SMP case we just need to ensure the same initial conditions
  808          * as the SMP case.
  809          */
  810         spinlock_enter();
  811         if (setup_func != NULL)
  812                 setup_func(arg);
  813         if (action_func != NULL)
  814                 action_func(arg);
  815         if (teardown_func != NULL)
  816                 teardown_func(arg);
  817         spinlock_exit();
  818 }
  819 
  820 void
  821 smp_rendezvous(void (*setup_func)(void *), 
  822                void (*action_func)(void *),
  823                void (*teardown_func)(void *),
  824                void *arg)
  825 {
  826 
  827         smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func,
  828             arg);
  829 }
  830 
  831 /*
  832  * Provide dummy SMP support for UP kernels.  Modules that need to use SMP
  833  * APIs will still work using this dummy support.
  834  */
  835 static void
  836 mp_setvariables_for_up(void *dummy)
  837 {
  838         mp_ncpus = 1;
  839         mp_ncores = 1;
  840         mp_maxid = PCPU_GET(cpuid);
  841         CPU_SETOF(mp_maxid, &all_cpus);
  842         KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
  843 }
  844 SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
  845     mp_setvariables_for_up, NULL);
  846 #endif /* SMP */
  847 
  848 void
  849 smp_no_rendezvous_barrier(void *dummy)
  850 {
  851 #ifdef SMP
  852         KASSERT((!smp_started),("smp_no_rendezvous called and smp is started"));
  853 #endif
  854 }
  855 
  856 /*
  857  * Wait for specified idle threads to switch once.  This ensures that even
  858  * preempted threads have cycled through the switch function once,
  859  * exiting their codepaths.  This allows us to change global pointers
  860  * with no other synchronization.
  861  */
  862 int
  863 quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
  864 {
  865         struct pcpu *pcpu;
  866         u_int gen[MAXCPU];
  867         int error;
  868         int cpu;
  869 
  870         error = 0;
  871         for (cpu = 0; cpu <= mp_maxid; cpu++) {
  872                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
  873                         continue;
  874                 pcpu = pcpu_find(cpu);
  875                 gen[cpu] = pcpu->pc_idlethread->td_generation;
  876         }
  877         for (cpu = 0; cpu <= mp_maxid; cpu++) {
  878                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
  879                         continue;
  880                 pcpu = pcpu_find(cpu);
  881                 thread_lock(curthread);
  882                 sched_bind(curthread, cpu);
  883                 thread_unlock(curthread);
  884                 while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
  885                         error = tsleep(quiesce_cpus, prio, wmesg, 1);
  886                         if (error != EWOULDBLOCK)
  887                                 goto out;
  888                         error = 0;
  889                 }
  890         }
  891 out:
  892         thread_lock(curthread);
  893         sched_unbind(curthread);
  894         thread_unlock(curthread);
  895 
  896         return (error);
  897 }
  898 
  899 int
  900 quiesce_all_cpus(const char *wmesg, int prio)
  901 {
  902 
  903         return quiesce_cpus(all_cpus, wmesg, prio);
  904 }
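
/*
 * Illustrative sketch (not part of the original file): replacing a global
 * function pointer and waiting until no CPU can still be executing the old
 * target.  The names example_hook and example_set_hook(), and the "exhook"
 * wait message, are placeholders.
 */
static void (*example_hook)(void);

static void __unused
example_set_hook(void (*new_hook)(void))
{

        example_hook = new_hook;
        /*
         * Per the comment above quiesce_cpus(): once every idle thread has
         * switched, even preempted threads have cycled through the switch
         * function, so the old pointer value can no longer be in use.
         */
        quiesce_all_cpus("exhook", 0);
}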
  905 
  906 /* Extra care is taken with this sysctl because the data type is volatile */
  907 static int
  908 sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
  909 {
  910         int error, active;
  911 
  912         active = smp_started;
  913         error = SYSCTL_OUT(req, &active, sizeof(active));
  914         return (error);
  915 }
  916 
  917 
  918 #ifdef SMP
  919 void
  920 topo_init_node(struct topo_node *node)
  921 {
  922 
  923         bzero(node, sizeof(*node));
  924         TAILQ_INIT(&node->children);
  925 }
  926 
  927 void
  928 topo_init_root(struct topo_node *root)
  929 {
  930 
  931         topo_init_node(root);
  932         root->type = TOPO_TYPE_SYSTEM;
  933 }
  934 
  935 /*
  936  * Add a child node with the given ID under the given parent.
  937  * Do nothing if there is already a child with that ID.
  938  */
  939 struct topo_node *
  940 topo_add_node_by_hwid(struct topo_node *parent, int hwid,
  941     topo_node_type type, uintptr_t subtype)
  942 {
  943         struct topo_node *node;
  944 
  945         TAILQ_FOREACH_REVERSE(node, &parent->children,
  946             topo_children, siblings) {
  947                 if (node->hwid == hwid
  948                     && node->type == type && node->subtype == subtype) {
  949                         return (node);
  950                 }
  951         }
  952 
  953         node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
  954         topo_init_node(node);
  955         node->parent = parent;
  956         node->hwid = hwid;
  957         node->type = type;
  958         node->subtype = subtype;
  959         TAILQ_INSERT_TAIL(&parent->children, node, siblings);
  960         parent->nchildren++;
  961 
  962         return (node);
  963 }
  964 
  965 /*
  966  * Find a child node with the given ID under the given parent.
  967  */
  968 struct topo_node *
  969 topo_find_node_by_hwid(struct topo_node *parent, int hwid,
  970     topo_node_type type, uintptr_t subtype)
  971 {
  972 
  973         struct topo_node *node;
  974 
  975         TAILQ_FOREACH(node, &parent->children, siblings) {
  976                 if (node->hwid == hwid
  977                     && node->type == type && node->subtype == subtype) {
  978                         return (node);
  979                 }
  980         }
  981 
  982         return (NULL);
  983 }
  984 
  985 /*
   986  * Given a node, change the order of its parent's child nodes such
   987  * that the node becomes the first child while preserving the cyclic
  988  * order of the children.  In other words, the given node is promoted
  989  * by rotation.
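 *
 * For example, promoting 'c' in the child list (a, b, c, d) yields
 * (c, d, a, b).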
  990  */
  991 void
  992 topo_promote_child(struct topo_node *child)
  993 {
  994         struct topo_node *next;
  995         struct topo_node *node;
  996         struct topo_node *parent;
  997 
  998         parent = child->parent;
  999         next = TAILQ_NEXT(child, siblings);
 1000         TAILQ_REMOVE(&parent->children, child, siblings);
 1001         TAILQ_INSERT_HEAD(&parent->children, child, siblings);
 1002 
 1003         while (next != NULL) {
 1004                 node = next;
 1005                 next = TAILQ_NEXT(node, siblings);
 1006                 TAILQ_REMOVE(&parent->children, node, siblings);
 1007                 TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
 1008                 child = node;
 1009         }
 1010 }
 1011 
 1012 /*
 1013  * Iterate to the next node in the depth-first search (traversal) of
 1014  * the topology tree.
 1015  */
 1016 struct topo_node *
 1017 topo_next_node(struct topo_node *top, struct topo_node *node)
 1018 {
 1019         struct topo_node *next;
 1020 
 1021         if ((next = TAILQ_FIRST(&node->children)) != NULL)
 1022                 return (next);
 1023 
 1024         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
 1025                 return (next);
 1026 
 1027         while (node != top && (node = node->parent) != top)
 1028                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
 1029                         return (next);
 1030 
 1031         return (NULL);
 1032 }
 1033 
 1034 /*
 1035  * Iterate to the next node in the depth-first search of the topology tree,
 1036  * but without descending below the current node.
 1037  */
 1038 struct topo_node *
 1039 topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
 1040 {
 1041         struct topo_node *next;
 1042 
 1043         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
 1044                 return (next);
 1045 
 1046         while (node != top && (node = node->parent) != top)
 1047                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
 1048                         return (next);
 1049 
 1050         return (NULL);
 1051 }
 1052 
 1053 /*
 1054  * Assign the given ID to the given topology node that represents a logical
 1055  * processor.
 1056  */
 1057 void
 1058 topo_set_pu_id(struct topo_node *node, cpuid_t id)
 1059 {
 1060 
 1061         KASSERT(node->type == TOPO_TYPE_PU,
 1062             ("topo_set_pu_id: wrong node type: %u", node->type));
 1063         KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
  1064             ("topo_set_pu_id: cpuset is already non-empty"));
 1065         node->id = id;
 1066         CPU_SET(id, &node->cpuset);
 1067         node->cpu_count = 1;
 1068         node->subtype = 1;
 1069 
 1070         while ((node = node->parent) != NULL) {
 1071                 KASSERT(!CPU_ISSET(id, &node->cpuset),
 1072                     ("logical ID %u is already set in node %p", id, node));
 1073                 CPU_SET(id, &node->cpuset);
 1074                 node->cpu_count++;
 1075         }
 1076 }
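
/*
 * Illustrative sketch (not part of the original file): how MD enumeration
 * code typically uses the helpers above to build a minimal single-package,
 * dual-core topology.  The names example_root and example_build_topology()
 * are placeholders.
 */
static struct topo_node example_root;

static void __unused
example_build_topology(void)
{
        struct topo_node *core, *pkg, *pu;
        int i;

        topo_init_root(&example_root);
        pkg = topo_add_node_by_hwid(&example_root, 0, TOPO_TYPE_PKG, 0);
        for (i = 0; i < 2; i++) {
                core = topo_add_node_by_hwid(pkg, i, TOPO_TYPE_CORE, 0);
                pu = topo_add_node_by_hwid(core, 0, TOPO_TYPE_PU, 0);
                topo_set_pu_id(pu, i);
        }
}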
 1077 
 1078 static struct topology_spec {
 1079         topo_node_type  type;
 1080         bool            match_subtype;
 1081         uintptr_t       subtype;
 1082 } topology_level_table[TOPO_LEVEL_COUNT] = {
 1083         [TOPO_LEVEL_PKG] = { .type = TOPO_TYPE_PKG, },
 1084         [TOPO_LEVEL_GROUP] = { .type = TOPO_TYPE_GROUP, },
 1085         [TOPO_LEVEL_CACHEGROUP] = {
 1086                 .type = TOPO_TYPE_CACHE,
 1087                 .match_subtype = true,
 1088                 .subtype = CG_SHARE_L3,
 1089         },
 1090         [TOPO_LEVEL_CORE] = { .type = TOPO_TYPE_CORE, },
 1091         [TOPO_LEVEL_THREAD] = { .type = TOPO_TYPE_PU, },
 1092 };
 1093 
 1094 static bool
 1095 topo_analyze_table(struct topo_node *root, int all, enum topo_level level,
 1096     struct topo_analysis *results)
 1097 {
 1098         struct topology_spec *spec;
 1099         struct topo_node *node;
 1100         int count;
 1101 
 1102         if (level >= TOPO_LEVEL_COUNT)
 1103                 return (true);
 1104 
 1105         spec = &topology_level_table[level];
 1106         count = 0;
 1107         node = topo_next_node(root, root);
 1108 
 1109         while (node != NULL) {
 1110                 if (node->type != spec->type ||
 1111                     (spec->match_subtype && node->subtype != spec->subtype)) {
 1112                         node = topo_next_node(root, node);
 1113                         continue;
 1114                 }
 1115                 if (!all && CPU_EMPTY(&node->cpuset)) {
 1116                         node = topo_next_nonchild_node(root, node);
 1117                         continue;
 1118                 }
 1119 
 1120                 count++;
 1121 
 1122                 if (!topo_analyze_table(node, all, level + 1, results))
 1123                         return (false);
 1124 
 1125                 node = topo_next_nonchild_node(root, node);
 1126         }
 1127 
  1128         /* Having no explicit subgroups is equivalent to one subgroup. */
 1129         if (count == 0) {
 1130                 count = 1;
 1131 
 1132                 if (!topo_analyze_table(root, all, level + 1, results))
 1133                         return (false);
 1134         }
 1135 
 1136         if (results->entities[level] == -1)
 1137                 results->entities[level] = count;
 1138         else if (results->entities[level] != count)
 1139                 return (false);
 1140 
 1141         return (true);
 1142 }
 1143 
 1144 /*
 1145  * Check if the topology is uniform, that is, each package has the same number
 1146  * of cores in it and each core has the same number of threads (logical
 1147  * processors) in it.  If so, calculate the number of packages, the number of
 1148  * groups per package, the number of cachegroups per group, and the number of
  1149  * logical processors per cachegroup.  The 'all' parameter tells whether to
  1150  * include administratively disabled logical processors in the analysis.
 1151  */
 1152 int
 1153 topo_analyze(struct topo_node *topo_root, int all,
 1154     struct topo_analysis *results)
 1155 {
 1156 
 1157         results->entities[TOPO_LEVEL_PKG] = -1;
 1158         results->entities[TOPO_LEVEL_CORE] = -1;
 1159         results->entities[TOPO_LEVEL_THREAD] = -1;
 1160         results->entities[TOPO_LEVEL_GROUP] = -1;
 1161         results->entities[TOPO_LEVEL_CACHEGROUP] = -1;
 1162 
 1163         if (!topo_analyze_table(topo_root, all, TOPO_LEVEL_PKG, results))
 1164                 return (0);
 1165 
 1166         KASSERT(results->entities[TOPO_LEVEL_PKG] > 0,
 1167                 ("bug in topology or analysis"));
 1168 
 1169         return (1);
 1170 }
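
/*
 * Illustrative sketch (not part of the original file): consuming the results
 * of topo_analyze() once the topology tree has been built.  The function name
 * example_print_topology() and the message format are placeholders.
 */
static void __unused
example_print_topology(struct topo_node *root)
{
        struct topo_analysis ta;

        if (!topo_analyze(root, 1, &ta)) {
                printf("non-uniform CPU topology\n");
                return;
        }
        printf("%d package(s) x %d core(s) x %d hardware thread(s)\n",
            ta.entities[TOPO_LEVEL_PKG], ta.entities[TOPO_LEVEL_CORE],
            ta.entities[TOPO_LEVEL_THREAD]);
}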
 1171 
 1172 #endif /* SMP */
 1173 
