FreeBSD/Linux Kernel Cross Reference
sys/kern/subr_smp.c
1 /*-
2 * Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26 /*
27 * This module holds the global variables and machine independent functions
28 * used for the kernel SMP support.
29 */
30
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/ktr.h>
38 #include <sys/proc.h>
39 #include <sys/bus.h>
40 #include <sys/lock.h>
41 #include <sys/malloc.h>
42 #include <sys/mutex.h>
43 #include <sys/pcpu.h>
44 #include <sys/sched.h>
45 #include <sys/smp.h>
46 #include <sys/sysctl.h>
47
48 #include <machine/cpu.h>
49 #include <machine/smp.h>
50
51 #include "opt_sched.h"
52
53 #ifdef SMP
54 MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");
55
56 volatile cpuset_t stopped_cpus;
57 volatile cpuset_t started_cpus;
58 volatile cpuset_t suspended_cpus;
59 cpuset_t hlt_cpus_mask;
60 cpuset_t logical_cpus_mask;
61
62 void (*cpustop_restartfunc)(void);
63 #endif
64
65 static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS);
66
67 /* This is used in modules that need to work in both SMP and UP. */
68 cpuset_t all_cpus;
69
70 int mp_ncpus;
71 /* export this for libkvm consumers. */
72 int mp_maxcpus = MAXCPU;
73
74 volatile int smp_started;
75 u_int mp_maxid;
76
77 static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL,
78 "Kernel SMP");
79
80 SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
81 "Max CPU ID.");
82
83 SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
84 0, "Max number of CPUs that the system was compiled for.");
85
86 SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD | CTLTYPE_INT, NULL, 0,
87 sysctl_kern_smp_active, "I", "Indicates system is running in SMP mode");
88
89 int smp_disabled = 0; /* has smp been disabled? */
90 SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
91 &smp_disabled, 0, "SMP has been disabled from the loader");
92
93 int smp_cpus = 1; /* how many cpu's running */
94 SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
95 "Number of CPUs online");
96
97 int smp_topology = 0; /* Which topology we're using. */
98 SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0,
99 "Topology override setting; 0 is default provided by hardware.");
100
101 #ifdef SMP
102 /* Enable forwarding of a signal to a process running on a different CPU */
103 static int forward_signal_enabled = 1;
104 SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
105 &forward_signal_enabled, 0,
106 "Forwarding of a signal to a process on a different CPU");
107
108 /* Variables needed for SMP rendezvous. */
109 static volatile int smp_rv_ncpus;
110 static void (*volatile smp_rv_setup_func)(void *arg);
111 static void (*volatile smp_rv_action_func)(void *arg);
112 static void (*volatile smp_rv_teardown_func)(void *arg);
113 static void *volatile smp_rv_func_arg;
114 static volatile int smp_rv_waiters[4];
115
116 /*
117 * Shared mutex to restrict busywaits between smp_rendezvous() and
118 * smp(_targeted)_tlb_shootdown(). A deadlock occurs if both of these
119 * functions trigger at once and cause multiple CPUs to busywait with
120 * interrupts disabled.
121 */
122 struct mtx smp_ipi_mtx;
123
124 /*
125 * Let the MD SMP code initialize mp_maxid very early if it can.
126 */
127 static void
128 mp_setmaxid(void *dummy)
129 {
130
131 cpu_mp_setmaxid();
132
133 KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__));
134 KASSERT(mp_ncpus > 1 || mp_maxid == 0,
135 ("%s: one CPU but mp_maxid is not zero", __func__));
136 KASSERT(mp_maxid >= mp_ncpus - 1,
137 ("%s: counters out of sync: max %d, count %d", __func__,
138 mp_maxid, mp_ncpus));
139 }
140 SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);
141
142 /*
143 * Call the MD SMP initialization code.
144 */
145 static void
146 mp_start(void *dummy)
147 {
148
149 mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);
150
151 /* Probe for MP hardware. */
152 if (smp_disabled != 0 || cpu_mp_probe() == 0) {
153 mp_ncpus = 1;
154 CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
155 return;
156 }
157
158 cpu_mp_start();
159 printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
160 mp_ncpus);
161 cpu_mp_announce();
162 }
163 SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);
164
165 void
166 forward_signal(struct thread *td)
167 {
168 int id;
169
170 /*
171 * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on
172 * this thread, so all we need to do is poke it if it is currently
173 * executing so that it executes ast().
174 */
175 THREAD_LOCK_ASSERT(td, MA_OWNED);
176 KASSERT(TD_IS_RUNNING(td),
177 ("forward_signal: thread is not TDS_RUNNING"));
178
179 CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);
180
181 if (!smp_started || cold || panicstr)
182 return;
183 if (!forward_signal_enabled)
184 return;
185
186 /* No need to IPI ourself. */
187 if (td == curthread)
188 return;
189
190 id = td->td_oncpu;
191 if (id == NOCPU)
192 return;
193 ipi_cpu(id, IPI_AST);
194 }
195
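/*
 * Editor's illustrative sketch (not part of the original file): how a
 * caller in the signal-delivery path might use forward_signal() above.
 * The helper name and calling context are hypothetical; the locking
 * shown is what forward_signal() asserts (thread lock held, thread
 * currently running).
 */
static void
example_poke_running_thread(struct thread *td)
{

	thread_lock(td);
	if (TD_IS_RUNNING(td))
		forward_signal(td);
	thread_unlock(td);
}
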
196 /*
197 * When called the executing CPU will send an IPI to all other CPUs
198 * requesting that they halt execution.
199 *
200 * Usually (but not necessarily) called with 'other_cpus' as its arg.
201 *
202 * - Signals all CPUs in map to stop.
203 * - Waits for each to stop.
204 *
205 * Returns:
206 * -1: error
207 * 0: NA
208 * 1: ok
209 *
210 */
211 #if defined(__amd64__) || defined(__i386__)
212 #define X86 1
213 #else
214 #define X86 0
215 #endif
216 static int
217 generic_stop_cpus(cpuset_t map, u_int type)
218 {
219 #ifdef KTR
220 char cpusetbuf[CPUSETBUFSIZ];
221 #endif
222 static volatile u_int stopping_cpu = NOCPU;
223 int i;
224 volatile cpuset_t *cpus;
225
226 KASSERT(
227 type == IPI_STOP || type == IPI_STOP_HARD
228 #if X86
229 || type == IPI_SUSPEND
230 #endif
231 , ("%s: invalid stop type", __func__));
232
233 if (!smp_started)
234 return (0);
235
236 CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
237 cpusetobj_strprint(cpusetbuf, &map), type);
238
239 #if X86
240 /*
241 * When suspending, ensure there are no IPIs in progress.
242 * IPIs that have been issued, but not yet delivered (e.g.
243 * not pending on a vCPU when running under virtualization)
244 * will be lost, violating FreeBSD's assumption of reliable
245 * IPI delivery.
246 */
247 if (type == IPI_SUSPEND)
248 mtx_lock_spin(&smp_ipi_mtx);
249 #endif
250
251 #if X86
252 if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
253 #endif
254 if (stopping_cpu != PCPU_GET(cpuid))
255 while (atomic_cmpset_int(&stopping_cpu, NOCPU,
256 PCPU_GET(cpuid)) == 0)
257 while (stopping_cpu != NOCPU)
258 cpu_spinwait(); /* spin */
259
260 /* send the stop IPI to all CPUs in map */
261 ipi_selected(map, type);
262 #if X86
263 }
264 #endif
265
266 #if X86
267 if (type == IPI_SUSPEND)
268 cpus = &suspended_cpus;
269 else
270 #endif
271 cpus = &stopped_cpus;
272
273 i = 0;
274 while (!CPU_SUBSET(cpus, &map)) {
275 /* spin */
276 cpu_spinwait();
277 i++;
278 if (i == 100000000) {
279 printf("timeout stopping cpus\n");
280 break;
281 }
282 }
283
284 #if X86
285 if (type == IPI_SUSPEND)
286 mtx_unlock_spin(&smp_ipi_mtx);
287 #endif
288
289 stopping_cpu = NOCPU;
290 return (1);
291 }
292
293 int
294 stop_cpus(cpuset_t map)
295 {
296
297 return (generic_stop_cpus(map, IPI_STOP));
298 }
299
300 int
301 stop_cpus_hard(cpuset_t map)
302 {
303
304 return (generic_stop_cpus(map, IPI_STOP_HARD));
305 }
306
307 #if X86
308 int
309 suspend_cpus(cpuset_t map)
310 {
311
312 return (generic_stop_cpus(map, IPI_SUSPEND));
313 }
314 #endif
315
316 /*
317 * Called by a CPU to restart stopped CPUs.
318 *
319 * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
320 *
321 * - Signals all CPUs in map to restart.
322 * - Waits for each to restart.
323 *
324 * Returns:
325 * -1: error
326 * 0: NA
327 * 1: ok
328 */
329 static int
330 generic_restart_cpus(cpuset_t map, u_int type)
331 {
332 #ifdef KTR
333 char cpusetbuf[CPUSETBUFSIZ];
334 #endif
335 volatile cpuset_t *cpus;
336
337 KASSERT(type == IPI_STOP || type == IPI_STOP_HARD
338 #if X86
339 || type == IPI_SUSPEND
340 #endif
341 , ("%s: invalid stop type", __func__));
342
343 if (!smp_started)
344 return (0);
345
346 CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));
347
348 #if X86
349 if (type == IPI_SUSPEND)
350 cpus = &resuming_cpus;
351 else
352 #endif
353 cpus = &stopped_cpus;
354
355 /* signal other cpus to restart */
356 #if X86
357 if (type == IPI_SUSPEND)
358 CPU_COPY_STORE_REL(&map, &toresume_cpus);
359 else
360 #endif
361 CPU_COPY_STORE_REL(&map, &started_cpus);
362
363 #if X86
364 if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
365 #endif
366 /* wait for each to clear its bit */
367 while (CPU_OVERLAP(cpus, &map))
368 cpu_spinwait();
369 #if X86
370 }
371 #endif
372
373 return (1);
374 }
375
376 int
377 restart_cpus(cpuset_t map)
378 {
379
380 return (generic_restart_cpus(map, IPI_STOP));
381 }
382
383 #if X86
384 int
385 resume_cpus(cpuset_t map)
386 {
387
388 return (generic_restart_cpus(map, IPI_SUSPEND));
389 }
390 #endif
391 #undef X86
392
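/*
 * Editor's illustrative sketch (not part of the original file): pausing
 * every other CPU around a critical update and restarting them with the
 * functions above.  The helper name and the "critical update" are
 * hypothetical; stop_cpus() and restart_cpus() are the APIs defined above.
 */
static void
example_stop_and_restart_others(void)
{
	cpuset_t map;

	map = all_cpus;
	CPU_CLR(PCPU_GET(cpuid), &map);		/* never stop ourselves */
	if (stop_cpus(map) == 0)
		return;				/* SMP not started yet */

	/* ... touch state that no other CPU may access concurrently ... */

	restart_cpus(map);
}
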
393 /*
394 * All-CPU rendezvous. CPUs are signalled, all execute the setup function
395 * (if specified), rendezvous, execute the action function (if specified),
396 * rendezvous again, execute the teardown function (if specified), and then
397 * resume.
398 *
399 * Note that the supplied external functions _must_ be reentrant and aware
400 * that they are running in parallel and in an unknown lock context.
401 */
402 void
403 smp_rendezvous_action(void)
404 {
405 struct thread *td;
406 void *local_func_arg;
407 void (*local_setup_func)(void*);
408 void (*local_action_func)(void*);
409 void (*local_teardown_func)(void*);
410 #ifdef INVARIANTS
411 int owepreempt;
412 #endif
413
414 /* Ensure we have up-to-date values. */
415 atomic_add_acq_int(&smp_rv_waiters[0], 1);
416 while (smp_rv_waiters[0] < smp_rv_ncpus)
417 cpu_spinwait();
418
419 /* Fetch rendezvous parameters after acquire barrier. */
420 local_func_arg = smp_rv_func_arg;
421 local_setup_func = smp_rv_setup_func;
422 local_action_func = smp_rv_action_func;
423 local_teardown_func = smp_rv_teardown_func;
424
425 /*
426 * Use a nested critical section to prevent any preemptions
427 * from occurring during a rendezvous action routine.
428 * Specifically, if a rendezvous handler is invoked via an IPI
429 * and the interrupted thread was in the critical_exit()
430 * function after setting td_critnest to 0 but before
431 * performing a deferred preemption, this routine can be
432 * invoked with td_critnest set to 0 and td_owepreempt true.
433 * In that case, a critical_exit() during the rendezvous
434 * action would trigger a preemption which is not permitted in
435 * a rendezvous action. To fix this, wrap all of the
436 * rendezvous action handlers in a critical section. We
437 * cannot use a regular critical section however as having
438 * critical_exit() preempt from this routine would also be
439 * problematic (the preemption must not occur before the IPI
440 * has been acknowledged via an EOI). Instead, we
441 * intentionally ignore td_owepreempt when leaving the
442 * critical section. This should be harmless because we do
443 * not permit rendezvous action routines to schedule threads,
444 * and thus td_owepreempt should never transition from 0 to 1
445 * during this routine.
446 */
447 td = curthread;
448 td->td_critnest++;
449 #ifdef INVARIANTS
450 owepreempt = td->td_owepreempt;
451 #endif
452
453 /*
454 * If requested, run a setup function before the main action
455 * function. Ensure all CPUs have completed the setup
456 * function before moving on to the action function.
457 */
458 if (local_setup_func != smp_no_rendezvous_barrier) {
459 if (smp_rv_setup_func != NULL)
460 smp_rv_setup_func(smp_rv_func_arg);
461 atomic_add_int(&smp_rv_waiters[1], 1);
462 while (smp_rv_waiters[1] < smp_rv_ncpus)
463 cpu_spinwait();
464 }
465
466 if (local_action_func != NULL)
467 local_action_func(local_func_arg);
468
469 if (local_teardown_func != smp_no_rendezvous_barrier) {
470 /*
471 * Signal that the main action has been completed. If a
472 * full exit rendezvous is requested, then all CPUs will
473 * wait here until all CPUs have finished the main action.
474 */
475 atomic_add_int(&smp_rv_waiters[2], 1);
476 while (smp_rv_waiters[2] < smp_rv_ncpus)
477 cpu_spinwait();
478
479 if (local_teardown_func != NULL)
480 local_teardown_func(local_func_arg);
481 }
482
483 /*
484 * Signal that the rendezvous is fully completed by this CPU.
485 * This means that no member of smp_rv_* pseudo-structure will be
486 * accessed by this target CPU after this point; in particular,
487 * memory pointed by smp_rv_func_arg.
488 *
489 * The release semantic ensures that all accesses performed by
490 * the current CPU are visible when smp_rendezvous_cpus()
491 * returns, by synchronizing with the
492 * atomic_load_acq_int(&smp_rv_waiters[3]).
493 */
494 atomic_add_rel_int(&smp_rv_waiters[3], 1);
495
496 td->td_critnest--;
497 KASSERT(owepreempt == td->td_owepreempt,
498 ("rendezvous action changed td_owepreempt"));
499 }
500
501 void
502 smp_rendezvous_cpus(cpuset_t map,
503 void (* setup_func)(void *),
504 void (* action_func)(void *),
505 void (* teardown_func)(void *),
506 void *arg)
507 {
508 int curcpumap, i, ncpus = 0;
509
510 /* See the comments in the !SMP case. */
511 if (!smp_started) {
512 spinlock_enter();
513 if (setup_func != NULL)
514 setup_func(arg);
515 if (action_func != NULL)
516 action_func(arg);
517 if (teardown_func != NULL)
518 teardown_func(arg);
519 spinlock_exit();
520 return;
521 }
522
523 CPU_FOREACH(i) {
524 if (CPU_ISSET(i, &map))
525 ncpus++;
526 }
527 if (ncpus == 0)
528 panic("ncpus is 0 with non-zero map");
529
530 mtx_lock_spin(&smp_ipi_mtx);
531
532 /* Pass rendezvous parameters via global variables. */
533 smp_rv_ncpus = ncpus;
534 smp_rv_setup_func = setup_func;
535 smp_rv_action_func = action_func;
536 smp_rv_teardown_func = teardown_func;
537 smp_rv_func_arg = arg;
538 smp_rv_waiters[1] = 0;
539 smp_rv_waiters[2] = 0;
540 smp_rv_waiters[3] = 0;
541 atomic_store_rel_int(&smp_rv_waiters[0], 0);
542
543 /*
544 * Signal other processors, which will enter the IPI with
545 * interrupts off.
546 */
547 curcpumap = CPU_ISSET(curcpu, &map);
548 CPU_CLR(curcpu, &map);
549 ipi_selected(map, IPI_RENDEZVOUS);
550
551 /* Check if the current CPU is in the map */
552 if (curcpumap != 0)
553 smp_rendezvous_action();
554
555 /*
556 * Ensure that the master CPU waits for all the other
557 * CPUs to finish the rendezvous, so that smp_rv_*
558 * pseudo-structure and the arg are guaranteed to not
559 * be in use.
560 *
561 * Load acquire synchronizes with the release add in
562 * smp_rendezvous_action(), which ensures that our caller sees
563 * all memory actions done by the called functions on other
564 * CPUs.
565 */
566 while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
567 cpu_spinwait();
568
569 mtx_unlock_spin(&smp_ipi_mtx);
570 }
571
572 void
573 smp_rendezvous(void (* setup_func)(void *),
574 void (* action_func)(void *),
575 void (* teardown_func)(void *),
576 void *arg)
577 {
578 smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
579 }
580
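/*
 * Editor's illustrative sketch (not part of the original file): running
 * a short action on every CPU via smp_rendezvous().  The counter and
 * helper names are hypothetical; passing smp_no_rendezvous_barrier for
 * the setup and teardown functions skips those two barriers, so only
 * the action function is synchronized.
 */
static volatile u_int example_rendezvous_hits;	/* hypothetical counter */

static void
example_count_cpu(void *arg __unused)
{

	/* Runs once on each CPU, from IPI context with interrupts off. */
	atomic_add_int(&example_rendezvous_hits, 1);
}

static void
example_count_all_cpus(void)
{

	example_rendezvous_hits = 0;
	smp_rendezvous(smp_no_rendezvous_barrier, example_count_cpu,
	    smp_no_rendezvous_barrier, NULL);
	KASSERT(example_rendezvous_hits == mp_ncpus,
	    ("example: rendezvous did not reach all CPUs"));
}
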
581 static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];
582
583 struct cpu_group *
584 smp_topo(void)
585 {
586 char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
587 struct cpu_group *top;
588
589 /*
590 * Check for a fake topology request for debugging purposes.
591 */
592 switch (smp_topology) {
593 case 1:
594 /* Dual core with no sharing. */
595 top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
596 break;
597 case 2:
598 /* No topology, all cpus are equal. */
599 top = smp_topo_none();
600 break;
601 case 3:
602 /* Dual core with shared L2. */
603 top = smp_topo_1level(CG_SHARE_L2, 2, 0);
604 break;
605 case 4:
606 /* quad core, shared l3 among each package, private l2. */
607 top = smp_topo_1level(CG_SHARE_L3, 4, 0);
608 break;
609 case 5:
610 /* quad core, 2 dualcore parts on each package share l2. */
611 top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
612 break;
613 case 6:
614 /* Single-core 2xHTT */
615 top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
616 break;
617 case 7:
618 /* quad core with a shared l3, 8 threads sharing L2. */
619 top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
620 CG_FLAG_SMT);
621 break;
622 default:
623 /* Default, ask the system what it wants. */
624 top = cpu_topo();
625 break;
626 }
627 /*
628 * Verify the returned topology.
629 */
630 if (top->cg_count != mp_ncpus)
631 panic("Built bad topology at %p. CPU count %d != %d",
632 top, top->cg_count, mp_ncpus);
633 if (CPU_CMP(&top->cg_mask, &all_cpus))
634 panic("Built bad topology at %p. CPU mask (%s) != (%s)",
635 top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
636 cpusetobj_strprint(cpusetbuf2, &all_cpus));
637 return (top);
638 }
639
640 struct cpu_group *
641 smp_topo_alloc(u_int count)
642 {
643 static u_int index;
644 u_int curr;
645
646 curr = index;
647 index += count;
648 return (&group[curr]);
649 }
650
651 struct cpu_group *
652 smp_topo_none(void)
653 {
654 struct cpu_group *top;
655
656 top = &group[0];
657 top->cg_parent = NULL;
658 top->cg_child = NULL;
659 top->cg_mask = all_cpus;
660 top->cg_count = mp_ncpus;
661 top->cg_children = 0;
662 top->cg_level = CG_SHARE_NONE;
663 top->cg_flags = 0;
664
665 return (top);
666 }
667
668 static int
669 smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
670 int count, int flags, int start)
671 {
672 char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
673 cpuset_t mask;
674 int i;
675
676 CPU_ZERO(&mask);
677 for (i = 0; i < count; i++, start++)
678 CPU_SET(start, &mask);
679 child->cg_parent = parent;
680 child->cg_child = NULL;
681 child->cg_children = 0;
682 child->cg_level = share;
683 child->cg_count = count;
684 child->cg_flags = flags;
685 child->cg_mask = mask;
686 parent->cg_children++;
687 for (; parent != NULL; parent = parent->cg_parent) {
688 if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
689 panic("Duplicate children in %p. mask (%s) child (%s)",
690 parent,
691 cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
692 cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
693 CPU_OR(&parent->cg_mask, &child->cg_mask);
694 parent->cg_count += child->cg_count;
695 }
696
697 return (start);
698 }
699
700 struct cpu_group *
701 smp_topo_1level(int share, int count, int flags)
702 {
703 struct cpu_group *child;
704 struct cpu_group *top;
705 int packages;
706 int cpu;
707 int i;
708
709 cpu = 0;
710 top = &group[0];
711 packages = mp_ncpus / count;
712 top->cg_child = child = &group[1];
713 top->cg_level = CG_SHARE_NONE;
714 for (i = 0; i < packages; i++, child++)
715 cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
716 return (top);
717 }
718
719 struct cpu_group *
720 smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
721 int l1flags)
722 {
723 struct cpu_group *top;
724 struct cpu_group *l1g;
725 struct cpu_group *l2g;
726 int cpu;
727 int i;
728 int j;
729
730 cpu = 0;
731 top = &group[0];
732 l2g = &group[1];
733 top->cg_child = l2g;
734 top->cg_level = CG_SHARE_NONE;
735 top->cg_children = mp_ncpus / (l2count * l1count);
736 l1g = l2g + top->cg_children;
737 for (i = 0; i < top->cg_children; i++, l2g++) {
738 l2g->cg_parent = top;
739 l2g->cg_child = l1g;
740 l2g->cg_level = l2share;
741 for (j = 0; j < l2count; j++, l1g++)
742 cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
743 l1flags, cpu);
744 }
745 return (top);
746 }
747
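/*
 * Editor's illustrative sketch (not part of the original file): how a
 * machine-dependent cpu_topo() implementation could describe a system
 * with 2xSMT cores, two cores per package sharing an L2, using the
 * helper above.  Real implementations live in the MD code and derive
 * these numbers from firmware or CPUID data.
 */
static struct cpu_group *
example_cpu_topo(void)
{

	return (smp_topo_2level(CG_SHARE_L2, 2, CG_SHARE_L1, 2,
	    CG_FLAG_SMT));
}
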
748
749 struct cpu_group *
750 smp_topo_find(struct cpu_group *top, int cpu)
751 {
752 struct cpu_group *cg;
753 cpuset_t mask;
754 int children;
755 int i;
756
757 CPU_SETOF(cpu, &mask);
758 cg = top;
759 for (;;) {
760 if (!CPU_OVERLAP(&cg->cg_mask, &mask))
761 return (NULL);
762 if (cg->cg_children == 0)
763 return (cg);
764 children = cg->cg_children;
765 for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
766 if (CPU_OVERLAP(&cg->cg_mask, &mask))
767 break;
768 }
769 return (NULL);
770 }
771 #else /* !SMP */
772
773 void
774 smp_rendezvous_cpus(cpuset_t map,
775 void (*setup_func)(void *),
776 void (*action_func)(void *),
777 void (*teardown_func)(void *),
778 void *arg)
779 {
780 /*
781 * In the !SMP case we just need to ensure the same initial conditions
782 * as the SMP case.
783 */
784 spinlock_enter();
785 if (setup_func != NULL)
786 setup_func(arg);
787 if (action_func != NULL)
788 action_func(arg);
789 if (teardown_func != NULL)
790 teardown_func(arg);
791 spinlock_exit();
792 }
793
794 void
795 smp_rendezvous(void (*setup_func)(void *),
796 void (*action_func)(void *),
797 void (*teardown_func)(void *),
798 void *arg)
799 {
800
801 /* See the comments in the smp_rendezvous_cpus() case. */
802 spinlock_enter();
803 if (setup_func != NULL)
804 setup_func(arg);
805 if (action_func != NULL)
806 action_func(arg);
807 if (teardown_func != NULL)
808 teardown_func(arg);
809 spinlock_exit();
810 }
811
812 /*
813 * Provide dummy SMP support for UP kernels. Modules that need to use SMP
814 * APIs will still work using this dummy support.
815 */
816 static void
817 mp_setvariables_for_up(void *dummy)
818 {
819 mp_ncpus = 1;
820 mp_maxid = PCPU_GET(cpuid);
821 CPU_SETOF(mp_maxid, &all_cpus);
822 KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
823 }
824 SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
825 mp_setvariables_for_up, NULL);
826 #endif /* SMP */
827
828 /*
829 * smp_no_rendevous_barrier was renamed to smp_no_rendezvous_barrier
830 * in __FreeBSD_version 1101508, with the old name remaining in 11.x
831 * as an alias for compatibility. The old name will be gone in 12.0
832 * (__FreeBSD_version >= 1200028).
833 */
834 __strong_reference(smp_no_rendezvous_barrier, smp_no_rendevous_barrier);
835 void
836 smp_no_rendezvous_barrier(void *dummy)
837 {
838 #ifdef SMP
839 KASSERT((!smp_started),("smp_no_rendezvous called and smp is started"));
840 #endif
841 }
842
843 /*
844 * Wait for the specified idle threads to switch once. This ensures that even
845 * preempted threads have cycled through the switch function once,
846 * exiting their codepaths. This allows us to change global pointers
847 * with no other synchronization.
848 */
849 int
850 quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
851 {
852 struct pcpu *pcpu;
853 u_int gen[MAXCPU];
854 int error;
855 int cpu;
856
857 error = 0;
858 for (cpu = 0; cpu <= mp_maxid; cpu++) {
859 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
860 continue;
861 pcpu = pcpu_find(cpu);
862 gen[cpu] = pcpu->pc_idlethread->td_generation;
863 }
864 for (cpu = 0; cpu <= mp_maxid; cpu++) {
865 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
866 continue;
867 pcpu = pcpu_find(cpu);
868 thread_lock(curthread);
869 sched_bind(curthread, cpu);
870 thread_unlock(curthread);
871 while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
872 error = tsleep(quiesce_cpus, prio, wmesg, 1);
873 if (error != EWOULDBLOCK)
874 goto out;
875 error = 0;
876 }
877 }
878 out:
879 thread_lock(curthread);
880 sched_unbind(curthread);
881 thread_unlock(curthread);
882
883 return (error);
884 }
885
886 int
887 quiesce_all_cpus(const char *wmesg, int prio)
888 {
889
890 return quiesce_cpus(all_cpus, wmesg, prio);
891 }
892
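/*
 * Editor's illustrative sketch (not part of the original file): the
 * "change a global pointer with no other synchronization" pattern that
 * the comment above quiesce_cpus() describes.  The structure, global
 * pointer and wait-message names are hypothetical.
 */
struct example_state;				/* hypothetical type */
static struct example_state *example_ptr;	/* hypothetical global */

static void
example_replace_state(struct example_state *new_state)
{
	struct example_state *old;

	old = example_ptr;
	example_ptr = new_state;	/* readers now see old or new */
	/* Wait until every CPU's idle thread has switched once. */
	quiesce_all_cpus("exqsce", 0);
	free(old, M_TEMP);		/* no CPU can still reference old */
}
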
893 /* Extra care is taken with this sysctl because the data type is volatile */
894 static int
895 sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
896 {
897 int error, active;
898
899 active = smp_started;
900 error = SYSCTL_OUT(req, &active, sizeof(active));
901 return (error);
902 }
903
904
905 #ifdef SMP
906 void
907 topo_init_node(struct topo_node *node)
908 {
909
910 bzero(node, sizeof(*node));
911 TAILQ_INIT(&node->children);
912 }
913
914 void
915 topo_init_root(struct topo_node *root)
916 {
917
918 topo_init_node(root);
919 root->type = TOPO_TYPE_SYSTEM;
920 }
921
922 /*
923 * Add a child node with the given ID under the given parent.
924 * Do nothing if there is already a child with that ID.
925 */
926 struct topo_node *
927 topo_add_node_by_hwid(struct topo_node *parent, int hwid,
928 topo_node_type type, uintptr_t subtype)
929 {
930 struct topo_node *node;
931
932 TAILQ_FOREACH_REVERSE(node, &parent->children,
933 topo_children, siblings) {
934 if (node->hwid == hwid
935 && node->type == type && node->subtype == subtype) {
936 return (node);
937 }
938 }
939
940 node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
941 topo_init_node(node);
942 node->parent = parent;
943 node->hwid = hwid;
944 node->type = type;
945 node->subtype = subtype;
946 TAILQ_INSERT_TAIL(&parent->children, node, siblings);
947 parent->nchildren++;
948
949 return (node);
950 }
951
952 /*
953 * Find a child node with the given ID under the given parent.
954 */
955 struct topo_node *
956 topo_find_node_by_hwid(struct topo_node *parent, int hwid,
957 topo_node_type type, uintptr_t subtype)
958 {
959
960 struct topo_node *node;
961
962 TAILQ_FOREACH(node, &parent->children, siblings) {
963 if (node->hwid == hwid
964 && node->type == type && node->subtype == subtype) {
965 return (node);
966 }
967 }
968
969 return (NULL);
970 }
971
972 /*
973 * Given a node change the order of its parent's child nodes such
974 * that the node becomes the first child while preserving the cyclic
975 * order of the children. In other words, the given node is promoted
976 * by rotation.
977 */
978 void
979 topo_promote_child(struct topo_node *child)
980 {
981 struct topo_node *next;
982 struct topo_node *node;
983 struct topo_node *parent;
984
985 parent = child->parent;
986 next = TAILQ_NEXT(child, siblings);
987 TAILQ_REMOVE(&parent->children, child, siblings);
988 TAILQ_INSERT_HEAD(&parent->children, child, siblings);
989
990 while (next != NULL) {
991 node = next;
992 next = TAILQ_NEXT(node, siblings);
993 TAILQ_REMOVE(&parent->children, node, siblings);
994 TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
995 child = node;
996 }
997 }
998
999 /*
1000 * Iterate to the next node in the depth-first search (traversal) of
1001 * the topology tree.
1002 */
1003 struct topo_node *
1004 topo_next_node(struct topo_node *top, struct topo_node *node)
1005 {
1006 struct topo_node *next;
1007
1008 if ((next = TAILQ_FIRST(&node->children)) != NULL)
1009 return (next);
1010
1011 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1012 return (next);
1013
1014 while ((node = node->parent) != top)
1015 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1016 return (next);
1017
1018 return (NULL);
1019 }
1020
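/*
 * Editor's illustrative sketch (not part of the original file): walking
 * an entire topology tree with the depth-first iterator above, counting
 * the PU (logical processor) nodes.  The helper name is hypothetical.
 */
static int
example_count_pus(struct topo_node *root)
{
	struct topo_node *node;
	int count;

	count = 0;
	for (node = root; node != NULL; node = topo_next_node(root, node))
		if (node->type == TOPO_TYPE_PU)
			count++;
	return (count);
}
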
1021 /*
1022 * Iterate to the next node in the depth-first search of the topology tree,
1023 * but without descending below the current node.
1024 */
1025 struct topo_node *
1026 topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
1027 {
1028 struct topo_node *next;
1029
1030 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1031 return (next);
1032
1033 while ((node = node->parent) != top)
1034 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1035 return (next);
1036
1037 return (NULL);
1038 }
1039
1040 /*
1041 * Assign the given ID to the given topology node that represents a logical
1042 * processor.
1043 */
1044 void
1045 topo_set_pu_id(struct topo_node *node, cpuid_t id)
1046 {
1047
1048 KASSERT(node->type == TOPO_TYPE_PU,
1049 ("topo_set_pu_id: wrong node type: %u", node->type));
1050 KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
1051 ("topo_set_pu_id: cpuset is not empty"));
1052 node->id = id;
1053 CPU_SET(id, &node->cpuset);
1054 node->cpu_count = 1;
1055 node->subtype = 1;
1056
1057 while ((node = node->parent) != NULL) {
1058 KASSERT(!CPU_ISSET(id, &node->cpuset),
1059 ("logical ID %u is already set in node %p", id, node));
1060 CPU_SET(id, &node->cpuset);
1061 node->cpu_count++;
1062 }
1063 }
1064
1065 /*
1066 * Check if the topology is uniform, that is, each package has the same number
1067 * of cores in it and each core has the same number of threads (logical
1068 * processors) in it. If so, calculate the number of packages, the number of
1069 * cores per package and the number of logical processors per core.
1070 * The 'all' parameter tells whether to include administratively disabled
1071 * logical processors in the analysis.
1072 */
1073 int
1074 topo_analyze(struct topo_node *topo_root, int all,
1075 int *pkg_count, int *cores_per_pkg, int *thrs_per_core)
1076 {
1077 struct topo_node *pkg_node;
1078 struct topo_node *core_node;
1079 struct topo_node *pu_node;
1080 int thrs_per_pkg;
1081 int cpp_counter;
1082 int tpc_counter;
1083 int tpp_counter;
1084
1085 *pkg_count = 0;
1086 *cores_per_pkg = -1;
1087 *thrs_per_core = -1;
1088 thrs_per_pkg = -1;
1089 pkg_node = topo_root;
1090 while (pkg_node != NULL) {
1091 if (pkg_node->type != TOPO_TYPE_PKG) {
1092 pkg_node = topo_next_node(topo_root, pkg_node);
1093 continue;
1094 }
1095 if (!all && CPU_EMPTY(&pkg_node->cpuset)) {
1096 pkg_node = topo_next_nonchild_node(topo_root, pkg_node);
1097 continue;
1098 }
1099
1100 (*pkg_count)++;
1101
1102 cpp_counter = 0;
1103 tpp_counter = 0;
1104 core_node = pkg_node;
1105 while (core_node != NULL) {
1106 if (core_node->type == TOPO_TYPE_CORE) {
1107 if (!all && CPU_EMPTY(&core_node->cpuset)) {
1108 core_node =
1109 topo_next_nonchild_node(pkg_node,
1110 core_node);
1111 continue;
1112 }
1113
1114 cpp_counter++;
1115
1116 tpc_counter = 0;
1117 pu_node = core_node;
1118 while (pu_node != NULL) {
1119 if (pu_node->type == TOPO_TYPE_PU &&
1120 (all || !CPU_EMPTY(&pu_node->cpuset)))
1121 tpc_counter++;
1122 pu_node = topo_next_node(core_node,
1123 pu_node);
1124 }
1125
1126 if (*thrs_per_core == -1)
1127 *thrs_per_core = tpc_counter;
1128 else if (*thrs_per_core != tpc_counter)
1129 return (0);
1130
1131 core_node = topo_next_nonchild_node(pkg_node,
1132 core_node);
1133 } else {
1134 /* PU node directly under PKG. */
1135 if (core_node->type == TOPO_TYPE_PU &&
1136 (all || !CPU_EMPTY(&core_node->cpuset)))
1137 tpp_counter++;
1138 core_node = topo_next_node(pkg_node,
1139 core_node);
1140 }
1141 }
1142
1143 if (*cores_per_pkg == -1)
1144 *cores_per_pkg = cpp_counter;
1145 else if (*cores_per_pkg != cpp_counter)
1146 return (0);
1147 if (thrs_per_pkg == -1)
1148 thrs_per_pkg = tpp_counter;
1149 else if (thrs_per_pkg != tpp_counter)
1150 return (0);
1151
1152 pkg_node = topo_next_nonchild_node(topo_root, pkg_node);
1153 }
1154
1155 KASSERT(*pkg_count > 0,
1156 ("bug in topology or analysis"));
1157 if (*cores_per_pkg == 0) {
1158 KASSERT(*thrs_per_core == -1 && thrs_per_pkg > 0,
1159 ("bug in topology or analysis"));
1160 *thrs_per_core = thrs_per_pkg;
1161 }
1162
1163 return (1);
1164 }
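1165
/*
 * Editor's illustrative sketch (not part of the original file): building
 * a minimal topology tree by hand and running topo_analyze() on it.  MD
 * enumeration code normally does this while parsing firmware tables; the
 * hardware IDs below are made up.
 */
static void
example_build_and_analyze(void)
{
	static struct topo_node root;
	struct topo_node *pkg, *core;
	int pkgs, cores, thrs;

	topo_init_root(&root);
	pkg = topo_add_node_by_hwid(&root, 0, TOPO_TYPE_PKG, 0);
	core = topo_add_node_by_hwid(pkg, 0, TOPO_TYPE_CORE, 0);
	topo_set_pu_id(topo_add_node_by_hwid(core, 0, TOPO_TYPE_PU, 0), 0);
	topo_set_pu_id(topo_add_node_by_hwid(core, 1, TOPO_TYPE_PU, 0), 1);

	/* Expect one package, one core per package, two threads per core. */
	if (topo_analyze(&root, 1, &pkgs, &cores, &thrs))
		printf("pkgs=%d cores/pkg=%d thrs/core=%d\n",
		    pkgs, cores, thrs);
}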
1165 #endif /* SMP */
1166