FreeBSD/Linux Kernel Cross Reference
sys/kern/subr_smp.c

    1 /*-
    2  * Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 /*
    28  * This module holds the global variables and machine-independent functions
    29  * used for kernel SMP support.
   30  */
   31 
   32 #include <sys/cdefs.h>
   33 __FBSDID("$FreeBSD: releng/11.0/sys/kern/subr_smp.c 297710 2016-04-08 11:59:11Z avg $");
   34 
   35 #include <sys/param.h>
   36 #include <sys/systm.h>
   37 #include <sys/kernel.h>
   38 #include <sys/ktr.h>
   39 #include <sys/proc.h>
   40 #include <sys/bus.h>
   41 #include <sys/lock.h>
   42 #include <sys/malloc.h>
   43 #include <sys/mutex.h>
   44 #include <sys/pcpu.h>
   45 #include <sys/sched.h>
   46 #include <sys/smp.h>
   47 #include <sys/sysctl.h>
   48 
   49 #include <machine/cpu.h>
   50 #include <machine/smp.h>
   51 
   52 #include "opt_sched.h"
   53 
   54 #ifdef SMP
   55 MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");
   56 
   57 volatile cpuset_t stopped_cpus;
   58 volatile cpuset_t started_cpus;
   59 volatile cpuset_t suspended_cpus;
   60 cpuset_t hlt_cpus_mask;
   61 cpuset_t logical_cpus_mask;
   62 
   63 void (*cpustop_restartfunc)(void);
   64 #endif
   65 
   66 static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS);
   67 
   68 /* This is used in modules that need to work in both SMP and UP. */
   69 cpuset_t all_cpus;
   70 
   71 int mp_ncpus;
   72 /* export this for libkvm consumers. */
   73 int mp_maxcpus = MAXCPU;
   74 
   75 volatile int smp_started;
   76 u_int mp_maxid;
   77 
   78 static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL,
   79     "Kernel SMP");
   80 
   81 SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
   82     "Max CPU ID.");
   83 
   84 SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
   85     0, "Max number of CPUs that the system was compiled for.");
   86 
   87 SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD | CTLTYPE_INT, NULL, 0,
   88     sysctl_kern_smp_active, "I", "Indicates system is running in SMP mode");
   89 
   90 int smp_disabled = 0;   /* has smp been disabled? */
   91 SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
   92     &smp_disabled, 0, "SMP has been disabled from the loader");
   93 
    94 int smp_cpus = 1;       /* how many CPUs are running */
   95 SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
   96     "Number of CPUs online");
   97 
   98 int smp_topology = 0;   /* Which topology we're using. */
   99 SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0,
  100     "Topology override setting; 0 is default provided by hardware.");
  101 
  102 #ifdef SMP
  103 /* Enable forwarding of a signal to a process running on a different CPU */
  104 static int forward_signal_enabled = 1;
  105 SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
  106            &forward_signal_enabled, 0,
  107            "Forwarding of a signal to a process on a different CPU");
  108 
  109 /* Variables needed for SMP rendezvous. */
  110 static volatile int smp_rv_ncpus;
  111 static void (*volatile smp_rv_setup_func)(void *arg);
  112 static void (*volatile smp_rv_action_func)(void *arg);
  113 static void (*volatile smp_rv_teardown_func)(void *arg);
  114 static void *volatile smp_rv_func_arg;
  115 static volatile int smp_rv_waiters[4];
  116 
  117 /* 
  118  * Shared mutex to restrict busywaits between smp_rendezvous() and
  119  * smp(_targeted)_tlb_shootdown().  A deadlock occurs if both of these
  120  * functions trigger at once and cause multiple CPUs to busywait with
  121  * interrupts disabled. 
  122  */
  123 struct mtx smp_ipi_mtx;
  124 
  125 /*
  126  * Let the MD SMP code initialize mp_maxid very early if it can.
  127  */
  128 static void
  129 mp_setmaxid(void *dummy)
  130 {
  131 
  132         cpu_mp_setmaxid();
  133 
  134         KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__));
  135         KASSERT(mp_ncpus > 1 || mp_maxid == 0,
  136             ("%s: one CPU but mp_maxid is not zero", __func__));
  137         KASSERT(mp_maxid >= mp_ncpus - 1,
  138             ("%s: counters out of sync: max %d, count %d", __func__,
  139                 mp_maxid, mp_ncpus));
  140 }
  141 SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);
  142 
  143 /*
  144  * Call the MD SMP initialization code.
  145  */
  146 static void
  147 mp_start(void *dummy)
  148 {
  149 
  150         mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);
  151 
  152         /* Probe for MP hardware. */
  153         if (smp_disabled != 0 || cpu_mp_probe() == 0) {
  154                 mp_ncpus = 1;
  155                 CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
  156                 return;
  157         }
  158 
  159         cpu_mp_start();
  160         printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
  161             mp_ncpus);
  162         cpu_mp_announce();
  163 }
  164 SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);
  165 
  166 void
  167 forward_signal(struct thread *td)
  168 {
  169         int id;
  170 
  171         /*
  172          * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on
  173          * this thread, so all we need to do is poke it if it is currently
  174          * executing so that it executes ast().
  175          */
  176         THREAD_LOCK_ASSERT(td, MA_OWNED);
  177         KASSERT(TD_IS_RUNNING(td),
  178             ("forward_signal: thread is not TDS_RUNNING"));
  179 
  180         CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);
  181 
  182         if (!smp_started || cold || panicstr)
  183                 return;
  184         if (!forward_signal_enabled)
  185                 return;
  186 
  187         /* No need to IPI ourself. */
  188         if (td == curthread)
  189                 return;
  190 
  191         id = td->td_oncpu;
  192         if (id == NOCPU)
  193                 return;
  194         ipi_cpu(id, IPI_AST);
  195 }
  196 
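/*
 * Illustrative sketch, not part of the original file: how signal-delivery
 * code might drive forward_signal().  The thread lock must be held and the
 * target must be running, as the assertions in forward_signal() require.
 * The helper name deliver_signal_hint() is hypothetical.
 */
static void
deliver_signal_hint(struct thread *td)
{

        thread_lock(td);
        if (TD_IS_RUNNING(td))
                forward_signal(td);     /* poke the remote CPU into ast() */
        thread_unlock(td);
}
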
  197 /*
   198  * When called, the executing CPU sends an IPI to the CPUs in the map,
   199  * requesting that they halt execution.
  200  *
  201  * Usually (but not necessarily) called with 'other_cpus' as its arg.
  202  *
  203  *  - Signals all CPUs in map to stop.
  204  *  - Waits for each to stop.
  205  *
  206  * Returns:
  207  *  -1: error
  208  *   0: NA
  209  *   1: ok
  210  *
  211  */
  212 static int
  213 generic_stop_cpus(cpuset_t map, u_int type)
  214 {
  215 #ifdef KTR
  216         char cpusetbuf[CPUSETBUFSIZ];
  217 #endif
  218         static volatile u_int stopping_cpu = NOCPU;
  219         int i;
  220         volatile cpuset_t *cpus;
  221 
  222         KASSERT(
  223 #if defined(__amd64__) || defined(__i386__)
  224             type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND,
  225 #else
  226             type == IPI_STOP || type == IPI_STOP_HARD,
  227 #endif
  228             ("%s: invalid stop type", __func__));
  229 
  230         if (!smp_started)
  231                 return (0);
  232 
  233         CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
  234             cpusetobj_strprint(cpusetbuf, &map), type);
  235 
  236 #if defined(__amd64__) || defined(__i386__)
  237         /*
   238          * When suspending, ensure there are no IPIs in progress.
  239          * IPIs that have been issued, but not yet delivered (e.g.
  240          * not pending on a vCPU when running under virtualization)
  241          * will be lost, violating FreeBSD's assumption of reliable
  242          * IPI delivery.
  243          */
  244         if (type == IPI_SUSPEND)
  245                 mtx_lock_spin(&smp_ipi_mtx);
  246 #endif
  247 
  248         if (stopping_cpu != PCPU_GET(cpuid))
  249                 while (atomic_cmpset_int(&stopping_cpu, NOCPU,
  250                     PCPU_GET(cpuid)) == 0)
  251                         while (stopping_cpu != NOCPU)
  252                                 cpu_spinwait(); /* spin */
  253 
  254         /* send the stop IPI to all CPUs in map */
  255         ipi_selected(map, type);
  256 
  257 #if defined(__amd64__) || defined(__i386__)
  258         if (type == IPI_SUSPEND)
  259                 cpus = &suspended_cpus;
  260         else
  261 #endif
  262                 cpus = &stopped_cpus;
  263 
  264         i = 0;
  265         while (!CPU_SUBSET(cpus, &map)) {
  266                 /* spin */
  267                 cpu_spinwait();
  268                 i++;
  269                 if (i == 100000000) {
  270                         printf("timeout stopping cpus\n");
  271                         break;
  272                 }
  273         }
  274 
  275 #if defined(__amd64__) || defined(__i386__)
  276         if (type == IPI_SUSPEND)
  277                 mtx_unlock_spin(&smp_ipi_mtx);
  278 #endif
  279 
  280         stopping_cpu = NOCPU;
  281         return (1);
  282 }
  283 
  284 int
  285 stop_cpus(cpuset_t map)
  286 {
  287 
  288         return (generic_stop_cpus(map, IPI_STOP));
  289 }
  290 
  291 int
  292 stop_cpus_hard(cpuset_t map)
  293 {
  294 
  295         return (generic_stop_cpus(map, IPI_STOP_HARD));
  296 }
  297 
  298 #if defined(__amd64__) || defined(__i386__)
  299 int
  300 suspend_cpus(cpuset_t map)
  301 {
  302 
  303         return (generic_stop_cpus(map, IPI_SUSPEND));
  304 }
  305 #endif
  306 
  307 /*
  308  * Called by a CPU to restart stopped CPUs. 
  309  *
  310  * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
  311  *
  312  *  - Signals all CPUs in map to restart.
  313  *  - Waits for each to restart.
  314  *
  315  * Returns:
  316  *  -1: error
  317  *   0: NA
  318  *   1: ok
  319  */
  320 static int
  321 generic_restart_cpus(cpuset_t map, u_int type)
  322 {
  323 #ifdef KTR
  324         char cpusetbuf[CPUSETBUFSIZ];
  325 #endif
  326         volatile cpuset_t *cpus;
  327 
  328         KASSERT(
  329 #if defined(__amd64__) || defined(__i386__)
  330             type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND,
  331 #else
  332             type == IPI_STOP || type == IPI_STOP_HARD,
  333 #endif
  334             ("%s: invalid stop type", __func__));
  335 
  336         if (!smp_started)
   337                 return (0);
  338 
  339         CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));
  340 
  341 #if defined(__amd64__) || defined(__i386__)
  342         if (type == IPI_SUSPEND)
  343                 cpus = &suspended_cpus;
  344         else
  345 #endif
  346                 cpus = &stopped_cpus;
  347 
  348         /* signal other cpus to restart */
  349         CPU_COPY_STORE_REL(&map, &started_cpus);
  350 
  351         /* wait for each to clear its bit */
  352         while (CPU_OVERLAP(cpus, &map))
  353                 cpu_spinwait();
  354 
   355         return (1);
  356 }
  357 
  358 int
  359 restart_cpus(cpuset_t map)
  360 {
  361 
  362         return (generic_restart_cpus(map, IPI_STOP));
  363 }
  364 
  365 #if defined(__amd64__) || defined(__i386__)
  366 int
  367 resume_cpus(cpuset_t map)
  368 {
  369 
  370         return (generic_restart_cpus(map, IPI_SUSPEND));
  371 }
  372 #endif
  373 
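/*
 * Illustrative sketch, not part of the original file: the usual pairing of
 * stop_cpus() and restart_cpus() suggested by the comments above.  The
 * caller stops every CPU except itself, does work that must not race with
 * the other CPUs, and then restarts whatever was actually stopped.  The
 * caller is assumed to be pinned or otherwise prevented from migrating;
 * do_exclusive_work is a hypothetical placeholder.
 */
static void
run_with_other_cpus_stopped(void (*do_exclusive_work)(void))
{
        cpuset_t map;

        map = all_cpus;
        CPU_CLR(PCPU_GET(cpuid), &map);         /* every CPU but this one */

        if (stop_cpus(map))
                do_exclusive_work();

        restart_cpus(stopped_cpus);             /* release the stopped CPUs */
}
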
  374 /*
  375  * All-CPU rendezvous.  CPUs are signalled, all execute the setup function 
  376  * (if specified), rendezvous, execute the action function (if specified),
  377  * rendezvous again, execute the teardown function (if specified), and then
  378  * resume.
  379  *
  380  * Note that the supplied external functions _must_ be reentrant and aware
  381  * that they are running in parallel and in an unknown lock context.
  382  */
  383 void
  384 smp_rendezvous_action(void)
  385 {
  386         struct thread *td;
  387         void *local_func_arg;
  388         void (*local_setup_func)(void*);
  389         void (*local_action_func)(void*);
  390         void (*local_teardown_func)(void*);
  391 #ifdef INVARIANTS
  392         int owepreempt;
  393 #endif
  394 
  395         /* Ensure we have up-to-date values. */
  396         atomic_add_acq_int(&smp_rv_waiters[0], 1);
  397         while (smp_rv_waiters[0] < smp_rv_ncpus)
  398                 cpu_spinwait();
  399 
  400         /* Fetch rendezvous parameters after acquire barrier. */
  401         local_func_arg = smp_rv_func_arg;
  402         local_setup_func = smp_rv_setup_func;
  403         local_action_func = smp_rv_action_func;
  404         local_teardown_func = smp_rv_teardown_func;
  405 
  406         /*
  407          * Use a nested critical section to prevent any preemptions
  408          * from occurring during a rendezvous action routine.
  409          * Specifically, if a rendezvous handler is invoked via an IPI
  410          * and the interrupted thread was in the critical_exit()
  411          * function after setting td_critnest to 0 but before
  412          * performing a deferred preemption, this routine can be
  413          * invoked with td_critnest set to 0 and td_owepreempt true.
  414          * In that case, a critical_exit() during the rendezvous
  415          * action would trigger a preemption which is not permitted in
  416          * a rendezvous action.  To fix this, wrap all of the
  417          * rendezvous action handlers in a critical section.  We
  418          * cannot use a regular critical section however as having
  419          * critical_exit() preempt from this routine would also be
  420          * problematic (the preemption must not occur before the IPI
  421          * has been acknowledged via an EOI).  Instead, we
  422          * intentionally ignore td_owepreempt when leaving the
  423          * critical section.  This should be harmless because we do
  424          * not permit rendezvous action routines to schedule threads,
  425          * and thus td_owepreempt should never transition from 0 to 1
  426          * during this routine.
  427          */
  428         td = curthread;
  429         td->td_critnest++;
  430 #ifdef INVARIANTS
  431         owepreempt = td->td_owepreempt;
  432 #endif
  433         
  434         /*
  435          * If requested, run a setup function before the main action
  436          * function.  Ensure all CPUs have completed the setup
  437          * function before moving on to the action function.
  438          */
  439         if (local_setup_func != smp_no_rendevous_barrier) {
  440                 if (smp_rv_setup_func != NULL)
  441                         smp_rv_setup_func(smp_rv_func_arg);
  442                 atomic_add_int(&smp_rv_waiters[1], 1);
  443                 while (smp_rv_waiters[1] < smp_rv_ncpus)
  444                         cpu_spinwait();
  445         }
  446 
  447         if (local_action_func != NULL)
  448                 local_action_func(local_func_arg);
  449 
  450         if (local_teardown_func != smp_no_rendevous_barrier) {
  451                 /*
  452                  * Signal that the main action has been completed.  If a
  453                  * full exit rendezvous is requested, then all CPUs will
  454                  * wait here until all CPUs have finished the main action.
  455                  */
  456                 atomic_add_int(&smp_rv_waiters[2], 1);
  457                 while (smp_rv_waiters[2] < smp_rv_ncpus)
  458                         cpu_spinwait();
  459 
  460                 if (local_teardown_func != NULL)
  461                         local_teardown_func(local_func_arg);
  462         }
  463 
  464         /*
  465          * Signal that the rendezvous is fully completed by this CPU.
  466          * This means that no member of smp_rv_* pseudo-structure will be
  467          * accessed by this target CPU after this point; in particular,
  468          * memory pointed by smp_rv_func_arg.
  469          *
  470          * The release semantic ensures that all accesses performed by
  471          * the current CPU are visible when smp_rendezvous_cpus()
  472          * returns, by synchronizing with the
  473          * atomic_load_acq_int(&smp_rv_waiters[3]).
  474          */
  475         atomic_add_rel_int(&smp_rv_waiters[3], 1);
  476 
  477         td->td_critnest--;
  478         KASSERT(owepreempt == td->td_owepreempt,
  479             ("rendezvous action changed td_owepreempt"));
  480 }
  481 
  482 void
  483 smp_rendezvous_cpus(cpuset_t map,
  484         void (* setup_func)(void *), 
  485         void (* action_func)(void *),
  486         void (* teardown_func)(void *),
  487         void *arg)
  488 {
  489         int curcpumap, i, ncpus = 0;
  490 
   491         /* See the comments in the !SMP case. */
  492         if (!smp_started) {
  493                 spinlock_enter();
  494                 if (setup_func != NULL)
  495                         setup_func(arg);
  496                 if (action_func != NULL)
  497                         action_func(arg);
  498                 if (teardown_func != NULL)
  499                         teardown_func(arg);
  500                 spinlock_exit();
  501                 return;
  502         }
  503 
  504         CPU_FOREACH(i) {
  505                 if (CPU_ISSET(i, &map))
  506                         ncpus++;
  507         }
  508         if (ncpus == 0)
  509                 panic("ncpus is 0 with non-zero map");
  510 
  511         mtx_lock_spin(&smp_ipi_mtx);
  512 
  513         /* Pass rendezvous parameters via global variables. */
  514         smp_rv_ncpus = ncpus;
  515         smp_rv_setup_func = setup_func;
  516         smp_rv_action_func = action_func;
  517         smp_rv_teardown_func = teardown_func;
  518         smp_rv_func_arg = arg;
  519         smp_rv_waiters[1] = 0;
  520         smp_rv_waiters[2] = 0;
  521         smp_rv_waiters[3] = 0;
  522         atomic_store_rel_int(&smp_rv_waiters[0], 0);
  523 
  524         /*
  525          * Signal other processors, which will enter the IPI with
  526          * interrupts off.
  527          */
  528         curcpumap = CPU_ISSET(curcpu, &map);
  529         CPU_CLR(curcpu, &map);
  530         ipi_selected(map, IPI_RENDEZVOUS);
  531 
  532         /* Check if the current CPU is in the map */
  533         if (curcpumap != 0)
  534                 smp_rendezvous_action();
  535 
  536         /*
  537          * Ensure that the master CPU waits for all the other
  538          * CPUs to finish the rendezvous, so that smp_rv_*
  539          * pseudo-structure and the arg are guaranteed to not
  540          * be in use.
  541          *
  542          * Load acquire synchronizes with the release add in
  543          * smp_rendezvous_action(), which ensures that our caller sees
  544          * all memory actions done by the called functions on other
  545          * CPUs.
  546          */
  547         while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
  548                 cpu_spinwait();
  549 
  550         mtx_unlock_spin(&smp_ipi_mtx);
  551 }
  552 
  553 void
  554 smp_rendezvous(void (* setup_func)(void *), 
  555                void (* action_func)(void *),
  556                void (* teardown_func)(void *),
  557                void *arg)
  558 {
  559         smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
  560 }
  561 
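/*
 * Illustrative sketch, not part of the original file: a typical call to
 * smp_rendezvous().  Passing smp_no_rendevous_barrier as the setup and
 * teardown functions skips those barriers, so each CPU simply runs the
 * action function once.  counter_action and count_all_cpus are
 * hypothetical names used only for illustration.
 */
static void
counter_action(void *arg)
{

        atomic_add_int((u_int *)arg, 1);        /* runs once on every CPU */
}

static void
count_all_cpus(void)
{
        u_int count = 0;

        smp_rendezvous(smp_no_rendevous_barrier, counter_action,
            smp_no_rendevous_barrier, &count);
        /* smp_rendezvous() has waited for all CPUs, so count is final. */
        printf("rendezvous action ran on %u CPUs\n", count);
}
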
  562 static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];
  563 
  564 struct cpu_group *
  565 smp_topo(void)
  566 {
  567         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
  568         struct cpu_group *top;
  569 
  570         /*
  571          * Check for a fake topology request for debugging purposes.
  572          */
  573         switch (smp_topology) {
  574         case 1:
  575                 /* Dual core with no sharing.  */
  576                 top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
  577                 break;
  578         case 2:
  579                 /* No topology, all cpus are equal. */
  580                 top = smp_topo_none();
  581                 break;
  582         case 3:
  583                 /* Dual core with shared L2.  */
  584                 top = smp_topo_1level(CG_SHARE_L2, 2, 0);
  585                 break;
  586         case 4:
   587                 /* Quad core, shared L3 among each package, private L2.  */
  588                 top = smp_topo_1level(CG_SHARE_L3, 4, 0);
  589                 break;
  590         case 5:
   591                 /* Quad core, 2 dual-core parts on each package share L2.  */
  592                 top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
  593                 break;
  594         case 6:
  595                 /* Single-core 2xHTT */
  596                 top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
  597                 break;
  598         case 7:
   599                 /* Quad core with a shared L3, 8 threads sharing L2.  */
  600                 top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
  601                     CG_FLAG_SMT);
  602                 break;
  603         default:
  604                 /* Default, ask the system what it wants. */
  605                 top = cpu_topo();
  606                 break;
  607         }
  608         /*
  609          * Verify the returned topology.
  610          */
  611         if (top->cg_count != mp_ncpus)
  612                 panic("Built bad topology at %p.  CPU count %d != %d",
  613                     top, top->cg_count, mp_ncpus);
  614         if (CPU_CMP(&top->cg_mask, &all_cpus))
  615                 panic("Built bad topology at %p.  CPU mask (%s) != (%s)",
  616                     top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
  617                     cpusetobj_strprint(cpusetbuf2, &all_cpus));
  618         return (top);
  619 }
  620 
  621 struct cpu_group *
  622 smp_topo_alloc(u_int count)
  623 {
  624         static u_int index;
  625         u_int curr;
  626 
  627         curr = index;
  628         index += count;
  629         return (&group[curr]);
  630 }
  631 
  632 struct cpu_group *
  633 smp_topo_none(void)
  634 {
  635         struct cpu_group *top;
  636 
  637         top = &group[0];
  638         top->cg_parent = NULL;
  639         top->cg_child = NULL;
  640         top->cg_mask = all_cpus;
  641         top->cg_count = mp_ncpus;
  642         top->cg_children = 0;
  643         top->cg_level = CG_SHARE_NONE;
  644         top->cg_flags = 0;
  645         
  646         return (top);
  647 }
  648 
  649 static int
  650 smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
  651     int count, int flags, int start)
  652 {
  653         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
  654         cpuset_t mask;
  655         int i;
  656 
  657         CPU_ZERO(&mask);
  658         for (i = 0; i < count; i++, start++)
  659                 CPU_SET(start, &mask);
  660         child->cg_parent = parent;
  661         child->cg_child = NULL;
  662         child->cg_children = 0;
  663         child->cg_level = share;
  664         child->cg_count = count;
  665         child->cg_flags = flags;
  666         child->cg_mask = mask;
  667         parent->cg_children++;
  668         for (; parent != NULL; parent = parent->cg_parent) {
  669                 if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
  670                         panic("Duplicate children in %p.  mask (%s) child (%s)",
  671                             parent,
  672                             cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
  673                             cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
  674                 CPU_OR(&parent->cg_mask, &child->cg_mask);
  675                 parent->cg_count += child->cg_count;
  676         }
  677 
  678         return (start);
  679 }
  680 
  681 struct cpu_group *
  682 smp_topo_1level(int share, int count, int flags)
  683 {
  684         struct cpu_group *child;
  685         struct cpu_group *top;
  686         int packages;
  687         int cpu;
  688         int i;
  689 
  690         cpu = 0;
  691         top = &group[0];
  692         packages = mp_ncpus / count;
  693         top->cg_child = child = &group[1];
  694         top->cg_level = CG_SHARE_NONE;
  695         for (i = 0; i < packages; i++, child++)
  696                 cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
  697         return (top);
  698 }
  699 
  700 struct cpu_group *
  701 smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
  702     int l1flags)
  703 {
  704         struct cpu_group *top;
  705         struct cpu_group *l1g;
  706         struct cpu_group *l2g;
  707         int cpu;
  708         int i;
  709         int j;
  710 
  711         cpu = 0;
  712         top = &group[0];
  713         l2g = &group[1];
  714         top->cg_child = l2g;
  715         top->cg_level = CG_SHARE_NONE;
  716         top->cg_children = mp_ncpus / (l2count * l1count);
  717         l1g = l2g + top->cg_children;
  718         for (i = 0; i < top->cg_children; i++, l2g++) {
  719                 l2g->cg_parent = top;
  720                 l2g->cg_child = l1g;
  721                 l2g->cg_level = l2share;
  722                 for (j = 0; j < l2count; j++, l1g++)
  723                         cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
  724                             l1flags, cpu);
  725         }
  726         return (top);
  727 }
  728 
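/*
 * Illustrative sketch, not part of the original file: how machine-dependent
 * code might use smp_topo_2level() to describe a hypothetical part with
 * packages of two cores, two SMT threads per core sharing an L1, and the
 * two cores sharing an L3.  The cache levels and counts are assumptions
 * chosen purely for illustration; example_cpu_topo is a hypothetical name.
 */
static struct cpu_group *
example_cpu_topo(void)
{

        /* Each package covers 2 cores x 2 threads = 4 logical CPUs. */
        if (mp_ncpus % 4 != 0)
                return (smp_topo_none());       /* fall back to a flat view */

        return (smp_topo_2level(CG_SHARE_L3, 2, CG_SHARE_L1, 2, CG_FLAG_SMT));
}
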
  729 
  730 struct cpu_group *
  731 smp_topo_find(struct cpu_group *top, int cpu)
  732 {
  733         struct cpu_group *cg;
  734         cpuset_t mask;
  735         int children;
  736         int i;
  737 
  738         CPU_SETOF(cpu, &mask);
  739         cg = top;
  740         for (;;) {
  741                 if (!CPU_OVERLAP(&cg->cg_mask, &mask))
  742                         return (NULL);
  743                 if (cg->cg_children == 0)
  744                         return (cg);
  745                 children = cg->cg_children;
  746                 for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
  747                         if (CPU_OVERLAP(&cg->cg_mask, &mask))
  748                                 break;
  749         }
  750         return (NULL);
  751 }
  752 #else /* !SMP */
  753 
  754 void
  755 smp_rendezvous_cpus(cpuset_t map,
  756         void (*setup_func)(void *), 
  757         void (*action_func)(void *),
  758         void (*teardown_func)(void *),
  759         void *arg)
  760 {
  761         /*
  762          * In the !SMP case we just need to ensure the same initial conditions
  763          * as the SMP case.
  764          */
  765         spinlock_enter();
  766         if (setup_func != NULL)
  767                 setup_func(arg);
  768         if (action_func != NULL)
  769                 action_func(arg);
  770         if (teardown_func != NULL)
  771                 teardown_func(arg);
  772         spinlock_exit();
  773 }
  774 
  775 void
  776 smp_rendezvous(void (*setup_func)(void *), 
  777                void (*action_func)(void *),
  778                void (*teardown_func)(void *),
  779                void *arg)
  780 {
  781 
   782         /* See the comments in the smp_rendezvous_cpus() case. */
  783         spinlock_enter();
  784         if (setup_func != NULL)
  785                 setup_func(arg);
  786         if (action_func != NULL)
  787                 action_func(arg);
  788         if (teardown_func != NULL)
  789                 teardown_func(arg);
  790         spinlock_exit();
  791 }
  792 
  793 /*
  794  * Provide dummy SMP support for UP kernels.  Modules that need to use SMP
  795  * APIs will still work using this dummy support.
  796  */
  797 static void
  798 mp_setvariables_for_up(void *dummy)
  799 {
  800         mp_ncpus = 1;
  801         mp_maxid = PCPU_GET(cpuid);
  802         CPU_SETOF(mp_maxid, &all_cpus);
  803         KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
  804 }
  805 SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
  806     mp_setvariables_for_up, NULL);
  807 #endif /* SMP */
  808 
  809 void
  810 smp_no_rendevous_barrier(void *dummy)
  811 {
  812 #ifdef SMP
   813         KASSERT(!smp_started, ("smp_no_rendevous called and smp is started"));
  814 #endif
  815 }
  816 
  817 /*
   818  * Wait for the specified idle threads to switch once.  This ensures that even
  819  * preempted threads have cycled through the switch function once,
  820  * exiting their codepaths.  This allows us to change global pointers
  821  * with no other synchronization.
  822  */
  823 int
  824 quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
  825 {
  826         struct pcpu *pcpu;
  827         u_int gen[MAXCPU];
  828         int error;
  829         int cpu;
  830 
  831         error = 0;
  832         for (cpu = 0; cpu <= mp_maxid; cpu++) {
  833                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
  834                         continue;
  835                 pcpu = pcpu_find(cpu);
  836                 gen[cpu] = pcpu->pc_idlethread->td_generation;
  837         }
  838         for (cpu = 0; cpu <= mp_maxid; cpu++) {
  839                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
  840                         continue;
  841                 pcpu = pcpu_find(cpu);
  842                 thread_lock(curthread);
  843                 sched_bind(curthread, cpu);
  844                 thread_unlock(curthread);
  845                 while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
  846                         error = tsleep(quiesce_cpus, prio, wmesg, 1);
  847                         if (error != EWOULDBLOCK)
  848                                 goto out;
  849                         error = 0;
  850                 }
  851         }
  852 out:
  853         thread_lock(curthread);
  854         sched_unbind(curthread);
  855         thread_unlock(curthread);
  856 
  857         return (error);
  858 }
  859 
  860 int
  861 quiesce_all_cpus(const char *wmesg, int prio)
  862 {
  863 
  864         return quiesce_cpus(all_cpus, wmesg, prio);
  865 }
  866 
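/*
 * Illustrative sketch, not part of the original file: the pattern described
 * in the comment above quiesce_cpus().  The global hook pointer is replaced
 * first; quiesce_all_cpus() then ensures every CPU's idle thread has
 * switched at least once, so no CPU can still be executing the old hook.
 * example_hook, swap_example_hook and the "hookqs" wait message are
 * hypothetical.
 */
static void (*example_hook)(void);

static int
swap_example_hook(void (*new_hook)(void))
{

        atomic_store_rel_ptr((volatile uintptr_t *)&example_hook,
            (uintptr_t)new_hook);
        /* After this returns, the old hook may be safely freed/unloaded. */
        return (quiesce_all_cpus("hookqs", 0));
}
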
  867 /* Extra care is taken with this sysctl because the data type is volatile */
  868 static int
  869 sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
  870 {
  871         int error, active;
  872 
  873         active = smp_started;
  874         error = SYSCTL_OUT(req, &active, sizeof(active));
  875         return (error);
  876 }
  877 
  878 
  879 #ifdef SMP
  880 void
  881 topo_init_node(struct topo_node *node)
  882 {
  883 
  884         bzero(node, sizeof(*node));
  885         TAILQ_INIT(&node->children);
  886 }
  887 
  888 void
  889 topo_init_root(struct topo_node *root)
  890 {
  891 
  892         topo_init_node(root);
  893         root->type = TOPO_TYPE_SYSTEM;
  894 }
  895 
  896 /*
  897  * Add a child node with the given ID under the given parent.
   898  * If such a child already exists, return it instead of adding a new one.
  899  */
  900 struct topo_node *
  901 topo_add_node_by_hwid(struct topo_node *parent, int hwid,
  902     topo_node_type type, uintptr_t subtype)
  903 {
  904         struct topo_node *node;
  905 
  906         TAILQ_FOREACH_REVERSE(node, &parent->children,
  907             topo_children, siblings) {
  908                 if (node->hwid == hwid
  909                     && node->type == type && node->subtype == subtype) {
  910                         return (node);
  911                 }
  912         }
  913 
  914         node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
  915         topo_init_node(node);
  916         node->parent = parent;
  917         node->hwid = hwid;
  918         node->type = type;
  919         node->subtype = subtype;
  920         TAILQ_INSERT_TAIL(&parent->children, node, siblings);
  921         parent->nchildren++;
  922 
  923         return (node);
  924 }
  925 
  926 /*
  927  * Find a child node with the given ID under the given parent.
  928  */
  929 struct topo_node *
  930 topo_find_node_by_hwid(struct topo_node *parent, int hwid,
  931     topo_node_type type, uintptr_t subtype)
  932 {
  933 
  934         struct topo_node *node;
  935 
  936         TAILQ_FOREACH(node, &parent->children, siblings) {
  937                 if (node->hwid == hwid
  938                     && node->type == type && node->subtype == subtype) {
  939                         return (node);
  940                 }
  941         }
  942 
  943         return (NULL);
  944 }
  945 
  946 /*
   947  * Given a node, change the order of its parent's child nodes such
   948  * that the node becomes the first child while preserving the cyclic
  949  * order of the children.  In other words, the given node is promoted
  950  * by rotation.
  951  */
  952 void
  953 topo_promote_child(struct topo_node *child)
  954 {
  955         struct topo_node *next;
  956         struct topo_node *node;
  957         struct topo_node *parent;
  958 
  959         parent = child->parent;
  960         next = TAILQ_NEXT(child, siblings);
  961         TAILQ_REMOVE(&parent->children, child, siblings);
  962         TAILQ_INSERT_HEAD(&parent->children, child, siblings);
  963 
  964         while (next != NULL) {
  965                 node = next;
  966                 next = TAILQ_NEXT(node, siblings);
  967                 TAILQ_REMOVE(&parent->children, node, siblings);
  968                 TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
  969                 child = node;
  970         }
  971 }
  972 
  973 /*
  974  * Iterate to the next node in the depth-first search (traversal) of
  975  * the topology tree.
  976  */
  977 struct topo_node *
  978 topo_next_node(struct topo_node *top, struct topo_node *node)
  979 {
  980         struct topo_node *next;
  981 
  982         if ((next = TAILQ_FIRST(&node->children)) != NULL)
  983                 return (next);
  984 
  985         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
  986                 return (next);
  987 
  988         while ((node = node->parent) != top)
  989                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
  990                         return (next);
  991 
  992         return (NULL);
  993 }
  994 
  995 /*
  996  * Iterate to the next node in the depth-first search of the topology tree,
  997  * but without descending below the current node.
  998  */
  999 struct topo_node *
 1000 topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
 1001 {
 1002         struct topo_node *next;
 1003 
 1004         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
 1005                 return (next);
 1006 
 1007         while ((node = node->parent) != top)
 1008                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
 1009                         return (next);
 1010 
 1011         return (NULL);
 1012 }
 1013 
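/*
 * Illustrative sketch, not part of the original file: walking every logical
 * processor (PU) node in a topology tree with topo_next_node(), in the same
 * style topo_analyze() below uses.  count_pus is a hypothetical helper.
 */
static int
count_pus(struct topo_node *root)
{
        struct topo_node *node;
        int count;

        count = 0;
        for (node = topo_next_node(root, root); node != NULL;
            node = topo_next_node(root, node)) {
                if (node->type == TOPO_TYPE_PU)
                        count++;
        }
        return (count);
}
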
 1014 /*
 1015  * Assign the given ID to the given topology node that represents a logical
 1016  * processor.
 1017  */
 1018 void
 1019 topo_set_pu_id(struct topo_node *node, cpuid_t id)
 1020 {
 1021 
 1022         KASSERT(node->type == TOPO_TYPE_PU,
 1023             ("topo_set_pu_id: wrong node type: %u", node->type));
 1024         KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
 1025             ("topo_set_pu_id: cpuset already not empty"));
 1026         node->id = id;
 1027         CPU_SET(id, &node->cpuset);
 1028         node->cpu_count = 1;
 1029         node->subtype = 1;
 1030 
 1031         while ((node = node->parent) != NULL) {
 1032                 KASSERT(!CPU_ISSET(id, &node->cpuset),
 1033                     ("logical ID %u is already set in node %p", id, node));
 1034                 CPU_SET(id, &node->cpuset);
 1035                 node->cpu_count++;
 1036         }
 1037 }
 1038 
 1039 /*
 1040  * Check if the topology is uniform, that is, each package has the same number
 1041  * of cores in it and each core has the same number of threads (logical
  1042  * processors) in it.  If so, calculate the number of packages, the number of
 1043  * cores per package and the number of logical processors per core.
  1044  * The 'all' parameter tells whether to include administratively disabled
  1045  * logical processors in the analysis.
 1046  */
 1047 int
 1048 topo_analyze(struct topo_node *topo_root, int all,
 1049     int *pkg_count, int *cores_per_pkg, int *thrs_per_core)
 1050 {
 1051         struct topo_node *pkg_node;
 1052         struct topo_node *core_node;
 1053         struct topo_node *pu_node;
 1054         int thrs_per_pkg;
 1055         int cpp_counter;
 1056         int tpc_counter;
 1057         int tpp_counter;
 1058 
 1059         *pkg_count = 0;
 1060         *cores_per_pkg = -1;
 1061         *thrs_per_core = -1;
 1062         thrs_per_pkg = -1;
 1063         pkg_node = topo_root;
 1064         while (pkg_node != NULL) {
 1065                 if (pkg_node->type != TOPO_TYPE_PKG) {
 1066                         pkg_node = topo_next_node(topo_root, pkg_node);
 1067                         continue;
 1068                 }
 1069                 if (!all && CPU_EMPTY(&pkg_node->cpuset)) {
 1070                         pkg_node = topo_next_nonchild_node(topo_root, pkg_node);
 1071                         continue;
 1072                 }
 1073 
 1074                 (*pkg_count)++;
 1075 
 1076                 cpp_counter = 0;
 1077                 tpp_counter = 0;
 1078                 core_node = pkg_node;
 1079                 while (core_node != NULL) {
 1080                         if (core_node->type == TOPO_TYPE_CORE) {
 1081                                 if (!all && CPU_EMPTY(&core_node->cpuset)) {
 1082                                         core_node =
 1083                                             topo_next_nonchild_node(pkg_node,
 1084                                                 core_node);
 1085                                         continue;
 1086                                 }
 1087 
 1088                                 cpp_counter++;
 1089 
 1090                                 tpc_counter = 0;
 1091                                 pu_node = core_node;
 1092                                 while (pu_node != NULL) {
 1093                                         if (pu_node->type == TOPO_TYPE_PU &&
 1094                                             (all || !CPU_EMPTY(&pu_node->cpuset)))
 1095                                                 tpc_counter++;
 1096                                         pu_node = topo_next_node(core_node,
 1097                                             pu_node);
 1098                                 }
 1099 
 1100                                 if (*thrs_per_core == -1)
 1101                                         *thrs_per_core = tpc_counter;
 1102                                 else if (*thrs_per_core != tpc_counter)
 1103                                         return (0);
 1104 
 1105                                 core_node = topo_next_nonchild_node(pkg_node,
 1106                                     core_node);
 1107                         } else {
 1108                                 /* PU node directly under PKG. */
 1109                                 if (core_node->type == TOPO_TYPE_PU &&
 1110                                    (all || !CPU_EMPTY(&core_node->cpuset)))
 1111                                         tpp_counter++;
 1112                                 core_node = topo_next_node(pkg_node,
 1113                                     core_node);
 1114                         }
 1115                 }
 1116 
 1117                 if (*cores_per_pkg == -1)
 1118                         *cores_per_pkg = cpp_counter;
 1119                 else if (*cores_per_pkg != cpp_counter)
 1120                         return (0);
 1121                 if (thrs_per_pkg == -1)
 1122                         thrs_per_pkg = tpp_counter;
 1123                 else if (thrs_per_pkg != tpp_counter)
 1124                         return (0);
 1125 
 1126                 pkg_node = topo_next_nonchild_node(topo_root, pkg_node);
 1127         }
 1128 
 1129         KASSERT(*pkg_count > 0,
 1130                 ("bug in topology or analysis"));
 1131         if (*cores_per_pkg == 0) {
 1132                 KASSERT(*thrs_per_core == -1 && thrs_per_pkg > 0,
 1133                         ("bug in topology or analysis"));
 1134                 *thrs_per_core = thrs_per_pkg;
 1135         }
 1136 
 1137         return (1);
 1138 }
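
/*
 * Illustrative sketch, not part of the original file: how machine-dependent
 * CPU enumeration might use the topo_* helpers above.  It builds a tree for
 * one package with two single-threaded cores, assigns logical CPU IDs, and
 * asks topo_analyze() whether the result is uniform.  example_root, the
 * hard-coded hardware IDs and the logical IDs are assumptions for
 * illustration only.
 */
static struct topo_node example_root;

static void
example_build_topology(void)
{
        struct topo_node *pkg, *core, *pu;
        int pkgs, cores_per_pkg, thrs_per_core;
        int i;

        topo_init_root(&example_root);
        pkg = topo_add_node_by_hwid(&example_root, 0, TOPO_TYPE_PKG, 0);
        for (i = 0; i < 2; i++) {
                core = topo_add_node_by_hwid(pkg, i, TOPO_TYPE_CORE, 0);
                pu = topo_add_node_by_hwid(core, i, TOPO_TYPE_PU, 0);
                topo_set_pu_id(pu, i);          /* logical CPU ID == i */
        }

        if (topo_analyze(&example_root, 1, &pkgs, &cores_per_pkg,
            &thrs_per_core))
                printf("uniform: %d package(s) x %d core(s) x %d thread(s)\n",
                    pkgs, cores_per_pkg, thrs_per_core);
}
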
 1139 #endif /* SMP */
 1140 
