FreeBSD/Linux Kernel Cross Reference
sys/common/disp/disp.c


    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or http://www.opensolaris.org/os/licensing.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
   23  * Use is subject to license terms.
   24  */
   25 
   26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
   27 /*        All Rights Reserved   */
   28 
   29 
   30 #include <sys/types.h>
   31 #include <sys/param.h>
   32 #include <sys/sysmacros.h>
   33 #include <sys/signal.h>
   34 #include <sys/user.h>
   35 #include <sys/systm.h>
   36 #include <sys/sysinfo.h>
   37 #include <sys/var.h>
   38 #include <sys/errno.h>
   39 #include <sys/cmn_err.h>
   40 #include <sys/debug.h>
   41 #include <sys/inline.h>
   42 #include <sys/disp.h>
   43 #include <sys/class.h>
   44 #include <sys/bitmap.h>
   45 #include <sys/kmem.h>
   46 #include <sys/cpuvar.h>
   47 #include <sys/vtrace.h>
   48 #include <sys/tnf.h>
   49 #include <sys/cpupart.h>
   50 #include <sys/lgrp.h>
   51 #include <sys/pg.h>
   52 #include <sys/cmt.h>
   53 #include <sys/bitset.h>
   54 #include <sys/schedctl.h>
   55 #include <sys/atomic.h>
   56 #include <sys/dtrace.h>
   57 #include <sys/sdt.h>
   58 #include <sys/archsystm.h>
   59 
   60 #include <vm/as.h>
   61 
   62 #define BOUND_CPU       0x1
   63 #define BOUND_PARTITION 0x2
   64 #define BOUND_INTR      0x4
   65 
   66 /* Dispatch queue allocation structure and functions */
   67 struct disp_queue_info {
   68         disp_t  *dp;
   69         dispq_t *olddispq;
   70         dispq_t *newdispq;
   71         ulong_t *olddqactmap;
   72         ulong_t *newdqactmap;
   73         int     oldnglobpris;
   74 };
   75 static void     disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
   76     disp_t *dp);
   77 static void     disp_dq_assign(struct disp_queue_info *dptr, int numpris);
   78 static void     disp_dq_free(struct disp_queue_info *dptr);
   79 
   80 /* platform-specific routine to call when processor is idle */
   81 static void     generic_idle_cpu();
   82 void            (*idle_cpu)() = generic_idle_cpu;
   83 
   84 /* routines invoked when a CPU enters/exits the idle loop */
   85 static void     idle_enter();
   86 static void     idle_exit();
   87 
   88 /* platform-specific routine to call when thread is enqueued */
   89 static void     generic_enq_thread(cpu_t *, int);
   90 void            (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
   91 
   92 pri_t   kpreemptpri;            /* priority where kernel preemption applies */
   93 pri_t   upreemptpri = 0;        /* priority where normal preemption applies */
   94 pri_t   intr_pri;               /* interrupt thread priority base level */
   95 
   96 #define KPQPRI  -1              /* pri where cpu affinity is dropped for kpq */
   97 pri_t   kpqpri = KPQPRI;        /* can be set in /etc/system */
   98 disp_t  cpu0_disp;              /* boot CPU's dispatch queue */
   99 disp_lock_t     swapped_lock;   /* lock swapped threads and swap queue */
  100 int     nswapped;               /* total number of swapped threads */
  101 void    disp_swapped_enq(kthread_t *tp);
  102 static void     disp_swapped_setrun(kthread_t *tp);
  103 static void     cpu_resched(cpu_t *cp, pri_t tpri);
  104 
  105 /*
  106  * If this is set, only interrupt threads will cause kernel preemptions.
  107  * This is done by changing the value of kpreemptpri.  kpreemptpri
  108  * will either be the max sysclass pri + 1 or the min interrupt pri.
  109  */
  110 int     only_intr_kpreempt;
  111 
  112 extern void set_idle_cpu(int cpun);
  113 extern void unset_idle_cpu(int cpun);
  114 static void setkpdq(kthread_t *tp, int borf);
  115 #define SETKP_BACK      0
  116 #define SETKP_FRONT     1
  117 /*
  118  * Parameter that determines how recently a thread must have run
  119  * on the CPU to be considered loosely bound to that CPU, to reduce
  120  * cold cache effects.  The interval is measured in clock ticks.
  121  */
  122 #define RECHOOSE_INTERVAL 3
  123 int     rechoose_interval = RECHOOSE_INTERVAL;
  124 
  125 /*
  126  * Parameter that determines how long (in nanoseconds) a thread must
  127  * be sitting on a run queue before it can be stolen by another CPU,
  128  * to reduce migrations.
  129  *
  130  * nosteal_nsec should be set by the platform code, via
  131  * cmp_set_nosteal_interval(), to an appropriate value.  It is set to
  132  * NOSTEAL_UNINITIALIZED here to indicate that it is uninitialized.
  133  * Setting nosteal_nsec to 0 effectively disables the nosteal
  134  * 'protection'.
  135  */
  136 #define NOSTEAL_UNINITIALIZED   (-1)
  137 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
  138 extern void cmp_set_nosteal_interval(void);
  139 
  140 id_t    defaultcid;     /* system "default" class; see dispadmin(1M) */
  141 
  142 disp_lock_t     transition_lock;        /* lock on transitioning threads */
  143 disp_lock_t     stop_lock;              /* lock on stopped threads */
  144 
  145 static void     cpu_dispqalloc(int numpris);
  146 
  147 /*
  148  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
  149  * a thread because it was sitting on its run queue for a very short
  150  * period of time.
  151  */
  152 #define T_DONTSTEAL     (kthread_t *)(-1) /* returned by disp_getwork/getbest */
  153 
  154 static kthread_t        *disp_getwork(cpu_t *to);
  155 static kthread_t        *disp_getbest(disp_t *from);
  156 static kthread_t        *disp_ratify(kthread_t *tp, disp_t *kpq);
  157 
  158 void    swtch_to(kthread_t *);
  159 
  160 /*
  161  * dispatcher and scheduler initialization
  162  */
  163 
  164 /*
  165  * disp_setup - Common code to calculate and allocate dispatcher
  166  *              variables and structures based on the maximum priority.
  167  */
  168 static void
  169 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
  170 {
  171         pri_t   newnglobpris;
  172 
  173         ASSERT(MUTEX_HELD(&cpu_lock));
  174 
  175         newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
  176 
  177         if (newnglobpris > oldnglobpris) {
  178                 /*
  179                  * Allocate new kp queues for each CPU partition.
  180                  */
  181                 cpupart_kpqalloc(newnglobpris);
  182 
  183                 /*
  184                  * Allocate new dispatch queues for each CPU.
  185                  */
  186                 cpu_dispqalloc(newnglobpris);
  187 
  188                 /*
  189                  * compute new interrupt thread base priority
  190                  */
  191                 intr_pri = maxglobpri;
  192                 if (only_intr_kpreempt) {
  193                         kpreemptpri = intr_pri + 1;
  194                         if (kpqpri == KPQPRI)
  195                                 kpqpri = kpreemptpri;
  196                 }
  197                 v.v_nglobpris = newnglobpris;
  198         }
  199 }
  200 
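The sizing rule above is simple arithmetic, but worth making explicit. A minimal restatement (the ex_ helper name and the sample numbers are hypothetical, not part of this file):

static pri_t
ex_new_nglobpris(pri_t maxglobpri)
{
        /* e.g. maxglobpri == 159, LOCK_LEVEL == 10:  159 + 1 + 10 == 170 */
        return (maxglobpri + 1 + LOCK_LEVEL);
}

The LOCK_LEVEL extra slots make room for the interrupt-thread priorities, which sit above the highest scheduling-class priority (intr_pri and up).
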
  201 /*
  202  * dispinit - Called to initialize all loaded classes and the
  203  *            dispatcher framework.
  204  */
  205 void
  206 dispinit(void)
  207 {
  208         id_t    cid;
  209         pri_t   maxglobpri;
  210         pri_t   cl_maxglobpri;
  211 
  212         maxglobpri = -1;
  213 
  214         /*
  215          * Initialize transition lock, which will always be set.
  216          */
  217         DISP_LOCK_INIT(&transition_lock);
  218         disp_lock_enter_high(&transition_lock);
  219         DISP_LOCK_INIT(&stop_lock);
  220 
  221         mutex_enter(&cpu_lock);
  222         CPU->cpu_disp->disp_maxrunpri = -1;
  223         CPU->cpu_disp->disp_max_unbound_pri = -1;
  224 
  225         /*
  226          * Initialize the default CPU partition.
  227          */
  228         cpupart_initialize_default();
  229         /*
  230          * Call the class specific initialization functions for
  231          * all pre-installed schedulers.
  232          *
  233          * We pass the size of a class specific parameter
  234          * buffer to each of the initialization functions
  235          * to try to catch problems with backward compatibility
  236          * of class modules.
  237          *
  238          * For example a new class module running on an old system
  239          * which didn't provide sufficiently large parameter buffers
  240          * would be bad news. Class initialization modules can check for
  241          * this and take action if they detect a problem.
  242          */
  243 
  244         for (cid = 0; cid < nclass; cid++) {
  245                 sclass_t        *sc;
  246 
  247                 sc = &sclass[cid];
  248                 if (SCHED_INSTALLED(sc)) {
  249                         cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
  250                             &sc->cl_funcs);
  251                         if (cl_maxglobpri > maxglobpri)
  252                                 maxglobpri = cl_maxglobpri;
  253                 }
  254         }
  255         kpreemptpri = (pri_t)v.v_maxsyspri + 1;
  256         if (kpqpri == KPQPRI)
  257                 kpqpri = kpreemptpri;
  258 
  259         ASSERT(maxglobpri >= 0);
  260         disp_setup(maxglobpri, 0);
  261 
  262         mutex_exit(&cpu_lock);
  263 
  264         /*
  265          * Platform specific sticky scheduler setup.
  266          */
  267         if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
  268                 cmp_set_nosteal_interval();
  269 
  270         /*
  271          * Get the default class ID; this may be later modified via
  272          * dispadmin(1M).  This will load the class (normally TS) and that will
  273          * call disp_add(), which is why we had to drop cpu_lock first.
  274          */
  275         if (getcid(defaultclass, &defaultcid) != 0) {
  276                 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
  277                     defaultclass);
  278         }
  279 }
  280 
  281 /*
  282  * disp_add - Called with class pointer to initialize the dispatcher
  283  *            for a newly loaded class.
  284  */
  285 void
  286 disp_add(sclass_t *clp)
  287 {
  288         pri_t   maxglobpri;
  289         pri_t   cl_maxglobpri;
  290 
  291         mutex_enter(&cpu_lock);
  292         /*
  293          * Initialize the scheduler class.
  294          */
  295         maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
  296         cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
  297         if (cl_maxglobpri > maxglobpri)
  298                 maxglobpri = cl_maxglobpri;
  299 
  300         /*
  301          * Save old queue information.  Since we're initializing a
  302          * new scheduling class which has just been loaded,
  303          * the size of the dispq may have changed.  We need to handle
  304          * that here.
  305          */
  306         disp_setup(maxglobpri, v.v_nglobpris);
  307 
  308         mutex_exit(&cpu_lock);
  309 }
  310 
  311 
  312 /*
  313  * For each CPU, allocate new dispatch queues
  314  * with the stated number of priorities.
  315  */
  316 static void
  317 cpu_dispqalloc(int numpris)
  318 {
  319         cpu_t   *cpup;
  320         struct disp_queue_info  *disp_mem;
  321         int i, num;
  322 
  323         ASSERT(MUTEX_HELD(&cpu_lock));
  324 
  325         disp_mem = kmem_zalloc(NCPU *
  326             sizeof (struct disp_queue_info), KM_SLEEP);
  327 
  328         /*
  329          * This routine must allocate all of the memory before stopping
  330          * the cpus because it must not sleep in kmem_alloc while the
  331          * CPUs are stopped.  Locks they hold will not be freed until they
  332          * are restarted.
  333          */
  334         i = 0;
  335         cpup = cpu_list;
  336         do {
  337                 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
  338                 i++;
  339                 cpup = cpup->cpu_next;
  340         } while (cpup != cpu_list);
  341         num = i;
  342 
  343         pause_cpus(NULL);
  344         for (i = 0; i < num; i++)
  345                 disp_dq_assign(&disp_mem[i], numpris);
  346         start_cpus();
  347 
  348         /*
  349          * We must free all of the memory after starting the cpus because
  350          * we cannot risk sleeping in kmem_free while the cpus are stopped.
  351          */
  352         for (i = 0; i < num; i++)
  353                 disp_dq_free(&disp_mem[i]);
  354 
  355         kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
  356 }
  357 
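Reduced to a single queue, the routine above follows a small, reusable shape: perform every allocation that may sleep while the other CPUs are still running, swap the pointers while they are paused, and postpone every free until they are running again. A hypothetical condensation, using only helpers already defined in this file:

static void
ex_resize_one_queue(disp_t *dp, int numpris)
{
        struct disp_queue_info info;

        disp_dq_alloc(&info, numpris, dp);      /* may sleep; CPUs still running */
        pause_cpus(NULL);
        disp_dq_assign(&info, numpris);         /* must not block here */
        start_cpus();
        disp_dq_free(&info);                    /* may sleep again */
}
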
  358 static void
  359 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
  360 {
  361         dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
  362         dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
  363             sizeof (long), KM_SLEEP);
  364         dptr->dp = dp;
  365 }
  366 
  367 static void
  368 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
  369 {
  370         disp_t  *dp;
  371 
  372         dp = dptr->dp;
  373         dptr->olddispq = dp->disp_q;
  374         dptr->olddqactmap = dp->disp_qactmap;
  375         dptr->oldnglobpris = dp->disp_npri;
  376 
  377         ASSERT(dptr->oldnglobpris < numpris);
  378 
  379         if (dptr->olddispq != NULL) {
  380                 /*
  381                  * Use kcopy because bcopy is platform-specific
  382                  * and could block while we might have paused the cpus.
  383                  */
  384                 (void) kcopy(dptr->olddispq, dptr->newdispq,
  385                     dptr->oldnglobpris * sizeof (dispq_t));
  386                 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
  387                     ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
  388                     sizeof (long));
  389         }
  390         dp->disp_q = dptr->newdispq;
  391         dp->disp_qactmap = dptr->newdqactmap;
  392         dp->disp_q_limit = &dptr->newdispq[numpris];
  393         dp->disp_npri = numpris;
  394 }
  395 
  396 static void
  397 disp_dq_free(struct disp_queue_info *dptr)
  398 {
  399         if (dptr->olddispq != NULL)
  400                 kmem_free(dptr->olddispq,
  401                     dptr->oldnglobpris * sizeof (dispq_t));
  402         if (dptr->olddqactmap != NULL)
  403                 kmem_free(dptr->olddqactmap,
  404                     ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
  405 }
  406 
  407 /*
  408  * For a newly created CPU, initialize the dispatch queue.
  409  * This is called before the CPU is known through cpu[] or on any lists.
  410  */
  411 void
  412 disp_cpu_init(cpu_t *cp)
  413 {
  414         disp_t  *dp;
  415         dispq_t *newdispq;
  416         ulong_t *newdqactmap;
  417 
  418         ASSERT(MUTEX_HELD(&cpu_lock));  /* protect dispatcher queue sizes */
  419 
  420         if (cp == cpu0_disp.disp_cpu)
  421                 dp = &cpu0_disp;
  422         else
  423                 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
  424         bzero(dp, sizeof (disp_t));
  425         cp->cpu_disp = dp;
  426         dp->disp_cpu = cp;
  427         dp->disp_maxrunpri = -1;
  428         dp->disp_max_unbound_pri = -1;
  429         DISP_LOCK_INIT(&cp->cpu_thread_lock);
  430         /*
  431          * Allocate memory for the dispatcher queue headers
  432          * and the active queue bitmap.
  433          */
  434         newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
  435         newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
  436             sizeof (long), KM_SLEEP);
  437         dp->disp_q = newdispq;
  438         dp->disp_qactmap = newdqactmap;
  439         dp->disp_q_limit = &newdispq[v.v_nglobpris];
  440         dp->disp_npri = v.v_nglobpris;
  441 }
  442 
  443 void
  444 disp_cpu_fini(cpu_t *cp)
  445 {
  446         ASSERT(MUTEX_HELD(&cpu_lock));
  447 
  448         disp_kp_free(cp->cpu_disp);
  449         if (cp->cpu_disp != &cpu0_disp)
  450                 kmem_free(cp->cpu_disp, sizeof (disp_t));
  451 }
  452 
  453 /*
  454  * Allocate new, larger kpreempt dispatch queue to replace the old one.
  455  */
  456 void
  457 disp_kp_alloc(disp_t *dq, pri_t npri)
  458 {
  459         struct disp_queue_info  mem_info;
  460 
  461         if (npri > dq->disp_npri) {
  462                 /*
  463                  * Allocate memory for the new array.
  464                  */
  465                 disp_dq_alloc(&mem_info, npri, dq);
  466 
  467                 /*
  468                  * We need to copy the old structures to the new
  469                  * and free the old.
  470                  */
  471                 disp_dq_assign(&mem_info, npri);
  472                 disp_dq_free(&mem_info);
  473         }
  474 }
  475 
  476 /*
  477  * Free dispatch queue.
  478  * Used for the kpreempt queues for a removed CPU partition and
  479  * for the per-CPU queues of deleted CPUs.
  480  */
  481 void
  482 disp_kp_free(disp_t *dq)
  483 {
  484         struct disp_queue_info  mem_info;
  485 
  486         mem_info.olddispq = dq->disp_q;
  487         mem_info.olddqactmap = dq->disp_qactmap;
  488         mem_info.oldnglobpris = dq->disp_npri;
  489         disp_dq_free(&mem_info);
  490 }
  491 
  492 /*
  493  * End dispatcher and scheduler initialization.
  494  */
  495 
  496 /*
  497  * See if there's anything to do other than remain idle.
  498  * Return non-zero if there is.
  499  *
  500  * This function must be called with high spl, or with
  501  * kernel preemption disabled to prevent the partition's
  502  * active cpu list from changing while being traversed.
  503  *
  504  * This is essentially a simpler version of disp_getwork()
  505  * to be called by CPUs preparing to "halt".
  506  */
  507 int
  508 disp_anywork(void)
  509 {
  510         cpu_t           *cp = CPU;
  511         cpu_t           *ocp;
  512         volatile int    *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
  513 
  514         if (!(cp->cpu_flags & CPU_OFFLINE)) {
  515                 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
  516                         return (1);
  517 
  518                 for (ocp = cp->cpu_next_part; ocp != cp;
  519                     ocp = ocp->cpu_next_part) {
  520                         ASSERT(CPU_ACTIVE(ocp));
  521 
  522                         /*
  523                          * Something has appeared on the local run queue.
  524                          */
  525                         if (*local_nrunnable > 0)
  526                                 return (1);
  527                         /*
  528                          * If we encounter another idle CPU that will
  529                          * soon be trolling around through disp_anywork(),
  530                          * terminate our walk here and let this other CPU
  531                          * patrol the next part of the list.
  532                          */
  533                         if (ocp->cpu_dispatch_pri == -1 &&
  534                             (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
  535                                 return (0);
  536                         /*
  537                          * Work can be taken from another CPU if:
  538                          *      - There is unbound work on the run queue
  539                          *      - That work isn't a thread undergoing a
  540                          *        context switch on an otherwise empty queue.
  541                          *      - The CPU isn't running the idle loop.
  542                          */
  543                         if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
  544                             !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
  545                             ocp->cpu_disp->disp_nrunnable == 1) &&
  546                             ocp->cpu_dispatch_pri != -1)
  547                                 return (1);
  548                 }
  549         }
  550         return (0);
  551 }
  552 
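disp_anywork() is meant for callers that are about to park a CPU. A minimal sketch of such a caller (hypothetical; the real platform halt routines live in machine-dependent code), honouring the requirement above by disabling kernel preemption around the check:

static void
ex_halt_if_idle(void)
{
        kpreempt_disable();
        if (disp_anywork() == 0) {
                /* nothing runnable that this CPU could help with; */
                /* a real caller would park the CPU here until an interrupt */
        }
        kpreempt_enable();
}
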
  553 /*
  554  * Called when CPU enters the idle loop
  555  */
  556 static void
  557 idle_enter()
  558 {
  559         cpu_t           *cp = CPU;
  560 
  561         new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
  562         CPU_STATS_ADDQ(cp, sys, idlethread, 1);
  563         set_idle_cpu(cp->cpu_id);       /* arch-dependent hook */
  564 }
  565 
  566 /*
  567  * Called when CPU exits the idle loop
  568  */
  569 static void
  570 idle_exit()
  571 {
  572         cpu_t           *cp = CPU;
  573 
  574         new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
  575         unset_idle_cpu(cp->cpu_id);     /* arch-dependent hook */
  576 }
  577 
  578 /*
  579  * Idle loop.
  580  */
  581 void
  582 idle()
  583 {
  584         struct cpu      *cp = CPU;              /* pointer to this CPU */
  585         kthread_t       *t;                     /* taken thread */
  586 
  587         idle_enter();
  588 
  589         /*
  590          * Uniprocessor version of idle loop.
  591          * Do this until notified that we're on an actual multiprocessor.
  592          */
  593         while (ncpus == 1) {
  594                 if (cp->cpu_disp->disp_nrunnable == 0) {
  595                         (*idle_cpu)();
  596                         continue;
  597                 }
  598                 idle_exit();
  599                 swtch();
  600 
  601                 idle_enter(); /* returned from swtch */
  602         }
  603 
  604         /*
  605          * Multiprocessor idle loop.
  606          */
  607         for (;;) {
  608                 /*
  609                  * If CPU is completely quiesced by p_online(2), just wait
  610                  * here with minimal bus traffic until put online.
  611                  */
  612                 while (cp->cpu_flags & CPU_QUIESCED)
  613                         (*idle_cpu)();
  614 
  615                 if (cp->cpu_disp->disp_nrunnable != 0) {
  616                         idle_exit();
  617                         swtch();
  618                 } else {
  619                         if (cp->cpu_flags & CPU_OFFLINE)
  620                                 continue;
  621                         if ((t = disp_getwork(cp)) == NULL) {
  622                                 if (cp->cpu_chosen_level != -1) {
  623                                         disp_t *dp = cp->cpu_disp;
  624                                         disp_t *kpq;
  625 
  626                                         disp_lock_enter(&dp->disp_lock);
  627                                         /*
  628                                          * Set kpq under lock to prevent
  629                                          * migration between partitions.
  630                                          */
  631                                         kpq = &cp->cpu_part->cp_kp_queue;
  632                                         if (kpq->disp_maxrunpri == -1)
  633                                                 cp->cpu_chosen_level = -1;
  634                                         disp_lock_exit(&dp->disp_lock);
  635                                 }
  636                                 (*idle_cpu)();
  637                                 continue;
  638                         }
  639                         /*
  640                          * If there was a thread but we couldn't steal
  641                          * it, then keep trying.
  642                          */
  643                         if (t == T_DONTSTEAL)
  644                                 continue;
  645                         idle_exit();
  646                         swtch_to(t);
  647                 }
  648                 idle_enter(); /* returned from swtch/swtch_to */
  649         }
  650 }
  651 
  652 
  653 /*
  654  * Preempt the currently running thread in favor of the highest
  655  * priority thread.  The class of the current thread controls
  656  * where it goes on the dispatcher queues. If panicking, turn
  657  * preemption off.
  658  */
  659 void
  660 preempt()
  661 {
  662         kthread_t       *t = curthread;
  663         klwp_t          *lwp = ttolwp(curthread);
  664 
  665         if (panicstr)
  666                 return;
  667 
  668         TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
  669 
  670         thread_lock(t);
  671 
  672         if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
  673                 /*
  674                  * this thread has already been chosen to be run on
  675                  * another CPU. Clear kprunrun on this CPU since we're
  676                  * already headed for swtch().
  677                  */
  678                 CPU->cpu_kprunrun = 0;
  679                 thread_unlock_nopreempt(t);
  680                 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
  681         } else {
  682                 if (lwp != NULL)
  683                         lwp->lwp_ru.nivcsw++;
  684                 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
  685                 THREAD_TRANSITION(t);
  686                 CL_PREEMPT(t);
  687                 DTRACE_SCHED(preempt);
  688                 thread_unlock_nopreempt(t);
  689 
  690                 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
  691 
  692                 swtch();                /* clears CPU->cpu_runrun via disp() */
  693         }
  694 }
  695 
  696 extern kthread_t *thread_unpin();
  697 
  698 /*
  699  * disp() - find the highest priority thread for this processor to run, and
  700  * set it in TS_ONPROC state so that resume() can be called to run it.
  701  */
  702 static kthread_t *
  703 disp()
  704 {
  705         cpu_t           *cpup;
  706         disp_t          *dp;
  707         kthread_t       *tp;
  708         dispq_t         *dq;
  709         int             maxrunword;
  710         pri_t           pri;
  711         disp_t          *kpq;
  712 
  713         TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
  714 
  715         cpup = CPU;
  716         /*
  717          * Find the highest priority loaded, runnable thread.
  718          */
  719         dp = cpup->cpu_disp;
  720 
  721 reschedule:
  722         /*
  723          * If there is more important work on the global queue with a better
  724          * priority than the maximum on this CPU, take it now.
  725          */
  726         kpq = &cpup->cpu_part->cp_kp_queue;
  727         while ((pri = kpq->disp_maxrunpri) >= 0 &&
  728             pri >= dp->disp_maxrunpri &&
  729             (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
  730             (tp = disp_getbest(kpq)) != NULL) {
  731                 if (disp_ratify(tp, kpq) != NULL) {
  732                         TRACE_1(TR_FAC_DISP, TR_DISP_END,
  733                             "disp_end:tid %p", tp);
  734                         return (tp);
  735                 }
  736         }
  737 
  738         disp_lock_enter(&dp->disp_lock);
  739         pri = dp->disp_maxrunpri;
  740 
  741         /*
  742          * If there is nothing to run, look at what's runnable on other queues.
  743          * Choose the idle thread if the CPU is quiesced.
  744          * Note that CPUs that have the CPU_OFFLINE flag set can still run
  745          * interrupt threads, which will be the only threads on the CPU's own
  746          * queue, but cannot run threads from other queues.
  747          */
  748         if (pri == -1) {
  749                 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
  750                         disp_lock_exit(&dp->disp_lock);
  751                         if ((tp = disp_getwork(cpup)) == NULL ||
  752                             tp == T_DONTSTEAL) {
  753                                 tp = cpup->cpu_idle_thread;
  754                                 (void) splhigh();
  755                                 THREAD_ONPROC(tp, cpup);
  756                                 cpup->cpu_dispthread = tp;
  757                                 cpup->cpu_dispatch_pri = -1;
  758                                 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
  759                                 cpup->cpu_chosen_level = -1;
  760                         }
  761                 } else {
  762                         disp_lock_exit_high(&dp->disp_lock);
  763                         tp = cpup->cpu_idle_thread;
  764                         THREAD_ONPROC(tp, cpup);
  765                         cpup->cpu_dispthread = tp;
  766                         cpup->cpu_dispatch_pri = -1;
  767                         cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
  768                         cpup->cpu_chosen_level = -1;
  769                 }
  770                 TRACE_1(TR_FAC_DISP, TR_DISP_END,
  771                     "disp_end:tid %p", tp);
  772                 return (tp);
  773         }
  774 
  775         dq = &dp->disp_q[pri];
  776         tp = dq->dq_first;
  777 
  778         ASSERT(tp != NULL);
  779         ASSERT(tp->t_schedflag & TS_LOAD);      /* thread must be swapped in */
  780 
  781         DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
  782 
  783         /*
  784          * Found it so remove it from queue.
  785          */
  786         dp->disp_nrunnable--;
  787         dq->dq_sruncnt--;
  788         if ((dq->dq_first = tp->t_link) == NULL) {
  789                 ulong_t *dqactmap = dp->disp_qactmap;
  790 
  791                 ASSERT(dq->dq_sruncnt == 0);
  792                 dq->dq_last = NULL;
  793 
  794                 /*
  795                  * The queue is empty, so the corresponding bit needs to be
  796                  * turned off in dqactmap.  If nrunnable != 0, we just took
  797                  * the last runnable thread off the highest-priority queue,
  798                  * so recompute disp_maxrunpri.
  799                  */
  800                 maxrunword = pri >> BT_ULSHIFT;
  801                 dqactmap[maxrunword] &= ~BT_BIW(pri);
  802 
  803                 if (dp->disp_nrunnable == 0) {
  804                         dp->disp_max_unbound_pri = -1;
  805                         dp->disp_maxrunpri = -1;
  806                 } else {
  807                         int ipri;
  808 
  809                         ipri = bt_gethighbit(dqactmap, maxrunword);
  810                         dp->disp_maxrunpri = ipri;
  811                         if (ipri < dp->disp_max_unbound_pri)
  812                                 dp->disp_max_unbound_pri = ipri;
  813                 }
  814         } else {
  815                 tp->t_link = NULL;
  816         }
  817 
  818         /*
  819          * Set TS_DONT_SWAP flag to prevent another processor from swapping
  820          * out this thread before we have a chance to run it.
  821          * While running, it is protected against swapping by t_lock.
  822          */
  823         tp->t_schedflag |= TS_DONT_SWAP;
  824         cpup->cpu_dispthread = tp;              /* protected by spl only */
  825         cpup->cpu_dispatch_pri = pri;
  826         ASSERT(pri == DISP_PRIO(tp));
  827         thread_onproc(tp, cpup);                /* set t_state to TS_ONPROC */
  828         disp_lock_exit_high(&dp->disp_lock);    /* drop run queue lock */
  829 
  830         ASSERT(tp != NULL);
  831         TRACE_1(TR_FAC_DISP, TR_DISP_END,
  832             "disp_end:tid %p", tp);
  833 
  834         if (disp_ratify(tp, kpq) == NULL)
  835                 goto reschedule;
  836 
  837         return (tp);
  838 }
  839 
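The disp_qactmap bookkeeping in disp() is easier to follow in isolation. The stand-alone model below (the ex_ names are hypothetical; the kernel uses the BT_* macros and bt_gethighbit() instead) captures the same idea: one bit per priority, with the highest set bit corresponding to disp_maxrunpri:

static void
ex_set_pri(unsigned long *map, int pri)
{
        int nbits = sizeof (unsigned long) * 8;

        map[pri / nbits] |= 1UL << (pri % nbits);
}

static pri_t
ex_max_pri(unsigned long *map, int words)
{
        int nbits = sizeof (unsigned long) * 8;
        int w, b;

        for (w = words - 1; w >= 0; w--) {
                if (map[w] == 0)
                        continue;
                for (b = nbits - 1; b >= 0; b--) {
                        if (map[w] & (1UL << b))
                                return ((pri_t)(w * nbits + b));
                }
        }
        return (-1);                    /* empty map, i.e. disp_maxrunpri == -1 */
}
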
  840 /*
  841  * swtch()
  842  *      Find best runnable thread and run it.
  843  *      Called with the current thread already switched to a new state,
  844  *      on a sleep queue, run queue, stopped, and not zombied.
  845  *      May be called at any spl level less than or equal to LOCK_LEVEL.
  846  *      Always drops spl to the base level (spl0()).
  847  */
  848 void
  849 swtch()
  850 {
  851         kthread_t       *t = curthread;
  852         kthread_t       *next;
  853         cpu_t           *cp;
  854 
  855         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
  856 
  857         if (t->t_flag & T_INTR_THREAD)
  858                 cpu_intr_swtch_enter(t);
  859 
  860         if (t->t_intr != NULL) {
  861                 /*
  862                  * We are an interrupt thread.  Set up and return
  863                  * the interrupted thread to be resumed.
  864                  */
  865                 (void) splhigh();       /* block other scheduler action */
  866                 cp = CPU;               /* now protected against migration */
  867                 ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
  868                 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
  869                 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
  870                 next = thread_unpin();
  871                 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
  872                 resume_from_intr(next);
  873         } else {
  874 #ifdef  DEBUG
  875                 if (t->t_state == TS_ONPROC &&
  876                     t->t_disp_queue->disp_cpu == CPU &&
  877                     t->t_preempt == 0) {
  878                         thread_lock(t);
  879                         ASSERT(t->t_state != TS_ONPROC ||
  880                             t->t_disp_queue->disp_cpu != CPU ||
  881                             t->t_preempt != 0); /* cannot migrate */
  882                         thread_unlock_nopreempt(t);
  883                 }
  884 #endif  /* DEBUG */
  885                 cp = CPU;
  886                 next = disp();          /* returns with spl high */
  887                 ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
  888 
  889                 /* OK to steal anything left on run queue */
  890                 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
  891 
  892                 if (next != t) {
  893                         hrtime_t now;
  894 
  895                         now = gethrtime_unscaled();
  896                         pg_ev_thread_swtch(cp, now, t, next);
  897 
  898                         /*
  899                          * If t was previously in the TS_ONPROC state,
  900                          * setfrontdq and setbackdq won't have set its t_waitrq.
  901                          * Since we now finally know that we're switching away
  902                          * from this thread, set its t_waitrq if it is on a run
  903                          * queue.
  904                          */
  905                         if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
  906                                 t->t_waitrq = now;
  907                         }
  908 
  909                         /*
  910                          * restore mstate of thread that we are switching to
  911                          */
  912                         restore_mstate(next);
  913 
  914                         CPU_STATS_ADDQ(cp, sys, pswitch, 1);
  915                         cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
  916                         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
  917 
  918                         if (dtrace_vtime_active)
  919                                 dtrace_vtime_switch(next);
  920 
  921                         resume(next);
  922                         /*
  923                          * The TR_RESUME_END and TR_SWTCH_END trace points
  924                          * appear at the end of resume(), because we may not
  925                          * return here
  926                          */
  927                 } else {
  928                         if (t->t_flag & T_INTR_THREAD)
  929                                 cpu_intr_swtch_exit(t);
  930                         /*
  931                          * Threads that enqueue themselves on a run queue defer
  932                          * setting t_waitrq. It is then either set in swtch()
  933                          * when the CPU is actually yielded, or not at all if it
  934                          * is remaining on the CPU.
  935                          * There is however a window between where the thread
  936                          * placed itself on a run queue, and where it selects
  937                          * itself in disp(), where a third party (e.g. clock()
  938                          * doing tick processing) may have re-enqueued this
  939                          * thread, setting t_waitrq in the process. We detect
  940                          * this race by noticing that despite switching to
  941                          * ourself, our t_waitrq has been set, and should be
  942                          * cleared.
  943                          */
  944                         if (t->t_waitrq != 0)
  945                                 t->t_waitrq = 0;
  946 
  947                         pg_ev_thread_remain(cp, t);
  948 
  949                         DTRACE_SCHED(remain__cpu);
  950                         TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
  951                         (void) spl0();
  952                 }
  953         }
  954 }
  955 
  956 /*
  957  * swtch_from_zombie()
  958  *      Special case of swtch(), which allows checks for TS_ZOMB to be
  959  *      eliminated from normal resume.
  960  *      Find best runnable thread and run it.
  961  *      Called with the current thread zombied.
  962  *      Zombies cannot migrate, so CPU references are safe.
  963  */
  964 void
  965 swtch_from_zombie()
  966 {
  967         kthread_t       *next;
  968         cpu_t           *cpu = CPU;
  969 
  970         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
  971 
  972         ASSERT(curthread->t_state == TS_ZOMB);
  973 
  974         next = disp();                  /* returns with spl high */
  975         ASSERT(CPU_ON_INTR(CPU) == 0);  /* not called with PIL > 10 */
  976         CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
  977         ASSERT(next != curthread);
  978         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
  979 
  980         pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
  981 
  982         restore_mstate(next);
  983 
  984         if (dtrace_vtime_active)
  985                 dtrace_vtime_switch(next);
  986 
  987         resume_from_zombie(next);
  988         /*
  989          * The TR_RESUME_END and TR_SWTCH_END trace points
  990          * appear at the end of resume(), because we certainly will not
  991          * return here
  992          */
  993 }
  994 
  995 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
  996 
  997 /*
  998  * search_disp_queues()
  999  *      Search the given dispatch queues for thread tp.
 1000  *      Return 1 if tp is found, otherwise return 0.
 1001  */
 1002 static int
 1003 search_disp_queues(disp_t *dp, kthread_t *tp)
 1004 {
 1005         dispq_t         *dq;
 1006         dispq_t         *eq;
 1007 
 1008         disp_lock_enter_high(&dp->disp_lock);
 1009 
 1010         for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
 1011                 kthread_t       *rp;
 1012 
 1013                 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
 1014 
 1015                 for (rp = dq->dq_first; rp; rp = rp->t_link)
 1016                         if (tp == rp) {
 1017                                 disp_lock_exit_high(&dp->disp_lock);
 1018                                 return (1);
 1019                         }
 1020         }
 1021         disp_lock_exit_high(&dp->disp_lock);
 1022 
 1023         return (0);
 1024 }
 1025 
 1026 /*
 1027  * thread_on_queue()
 1028  *      Search all per-CPU dispatch queues and all partition-wide kpreempt
 1029  *      queues for thread tp. Return 1 if tp is found, otherwise return 0.
 1030  */
 1031 static int
 1032 thread_on_queue(kthread_t *tp)
 1033 {
 1034         cpu_t           *cp;
 1035         struct cpupart  *part;
 1036 
 1037         ASSERT(getpil() >= DISP_LEVEL);
 1038 
 1039         /*
 1040          * Search the per-CPU dispatch queues for tp.
 1041          */
 1042         cp = CPU;
 1043         do {
 1044                 if (search_disp_queues(cp->cpu_disp, tp))
 1045                         return (1);
 1046         } while ((cp = cp->cpu_next_onln) != CPU);
 1047 
 1048         /*
 1049          * Search the partition-wide kpreempt queues for tp.
 1050          */
 1051         part = CPU->cpu_part;
 1052         do {
 1053                 if (search_disp_queues(&part->cp_kp_queue, tp))
 1054                         return (1);
 1055         } while ((part = part->cp_next) != CPU->cpu_part);
 1056 
 1057         return (0);
 1058 }
 1059 
 1060 #else
 1061 
 1062 #define thread_on_queue(tp)     0       /* ASSERT must be !thread_on_queue */
 1063 
 1064 #endif  /* DEBUG */
 1065 
 1066 /*
 1067  * like swtch(), but switch to a specified thread taken from another CPU.
 1068  *      Called with spl high.
 1069  */
 1070 void
 1071 swtch_to(kthread_t *next)
 1072 {
 1073         cpu_t                   *cp = CPU;
 1074         hrtime_t                now;
 1075 
 1076         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 1077 
 1078         /*
 1079          * Update context switch statistics.
 1080          */
 1081         CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 1082 
 1083         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 1084 
 1085         now = gethrtime_unscaled();
 1086         pg_ev_thread_swtch(cp, now, curthread, next);
 1087 
 1088         /* OK to steal anything left on run queue */
 1089         cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
 1090 
 1091         /* record last execution time */
 1092         cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
 1093 
 1094         /*
 1095          * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
 1096          * won't have set its t_waitrq.  Since we now finally know that we're
 1097          * switching away from this thread, set its t_waitrq if it is on a run
 1098          * queue.
 1099          */
 1100         if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
 1101                 curthread->t_waitrq = now;
 1102         }
 1103 
 1104         /* restore next thread to previously running microstate */
 1105         restore_mstate(next);
 1106 
 1107         if (dtrace_vtime_active)
 1108                 dtrace_vtime_switch(next);
 1109 
 1110         resume(next);
 1111         /*
 1112          * The TR_RESUME_END and TR_SWTCH_END trace points
 1113          * appear at the end of resume(), because we may not
 1114          * return here
 1115          */
 1116 }
 1117 
 1118 #define CPU_IDLING(pri) ((pri) == -1)
 1119 
 1120 static void
 1121 cpu_resched(cpu_t *cp, pri_t tpri)
 1122 {
 1123         int     call_poke_cpu = 0;
 1124         pri_t   cpupri = cp->cpu_dispatch_pri;
 1125 
 1126         if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
 1127                 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
 1128                     "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
 1129                 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
 1130                         cp->cpu_runrun = 1;
 1131                         aston(cp->cpu_dispthread);
 1132                         if (tpri < kpreemptpri && cp != CPU)
 1133                                 call_poke_cpu = 1;
 1134                 }
 1135                 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
 1136                         cp->cpu_kprunrun = 1;
 1137                         if (cp != CPU)
 1138                                 call_poke_cpu = 1;
 1139                 }
 1140         }
 1141 
 1142         /*
 1143          * Propagate cpu_runrun and cpu_kprunrun to global visibility.
 1144          */
 1145         membar_enter();
 1146 
 1147         if (call_poke_cpu)
 1148                 poke_cpu(cp->cpu_id);
 1149 }
 1150 
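Read as a decision table, cpu_resched() distinguishes two preemption levels. The hypothetical helper below (not part of this file) names the outcome for a newly runnable priority, assuming the usual ordering upreemptpri <= kpreemptpri and a CPU that is currently running something of lower priority:

static const char *
ex_resched_kind(pri_t tpri)
{
        if (tpri >= kpreemptpri)
                return ("kernel preemption: cpu_runrun and cpu_kprunrun set");
        if (tpri >= upreemptpri)
                return ("user preemption: cpu_runrun set");
        return ("no preemption requested");
}
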
 1151 /*
 1152  * setbackdq() keeps runqs balanced such that the difference in length
 1153  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 1154  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq lengths
 1155  * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
 1156  * try to keep runqs perfectly balanced regardless of the thread priority.
 1157  */
 1158 #define RUNQ_MATCH_PRI  16      /* pri below which queue lengths must match */
 1159 #define RUNQ_MAX_DIFF   2       /* maximum runq length difference */
 1160 #define RUNQ_LEN(cp, pri)       ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
 1161 
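The effect of these knobs on setbackdq()'s queue choice can be stated compactly. A hypothetical predicate mirroring the balancing test that appears further below (runqmatch stands in for the thread's TS_RUNQMATCH flag):

static int
ex_should_migrate(int qlen_here, int qlen_next, pri_t tpri, int runqmatch)
{
        if (tpri >= RUNQ_MATCH_PRI && !runqmatch)
                qlen_here -= RUNQ_MAX_DIFF;     /* tolerate a small imbalance */
        return (qlen_here > 0 && qlen_next < qlen_here);
}
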
 1162 /*
 1163  * Macro that evaluates to true if it is likely that the thread has cache
 1164  * warmth. This is based on the amount of time that has elapsed since the
 1165  * thread last ran. If that amount of time is less than "rechoose_interval"
 1166  * ticks, then we decide that the thread has enough cache warmth to warrant
 1167  * some affinity for t->t_cpu.
 1168  */
 1169 #define THREAD_HAS_CACHE_WARMTH(thread) \
 1170         ((thread == curthread) ||       \
 1171         ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
 1172 /*
 1173  * Put the specified thread on the back of the dispatcher
 1174  * queue corresponding to its current priority.
 1175  *
 1176  * Called with the thread in transition, onproc or stopped state
 1177  * and locked (transition implies locked) and at high spl.
 1178  * Returns with the thread in TS_RUN state and still locked.
 1179  */
 1180 void
 1181 setbackdq(kthread_t *tp)
 1182 {
 1183         dispq_t *dq;
 1184         disp_t          *dp;
 1185         cpu_t           *cp;
 1186         pri_t           tpri;
 1187         int             bound;
 1188         boolean_t       self;
 1189 
 1190         ASSERT(THREAD_LOCK_HELD(tp));
 1191         ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
 1192         ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
 1193 
 1194         /*
 1195          * If thread is "swapped" or on the swap queue don't
 1196          * queue it, but wake sched.
 1197          */
 1198         if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
 1199                 disp_swapped_setrun(tp);
 1200                 return;
 1201         }
 1202 
 1203         self = (tp == curthread);
 1204 
 1205         if (tp->t_bound_cpu || tp->t_weakbound_cpu)
 1206                 bound = 1;
 1207         else
 1208                 bound = 0;
 1209 
 1210         tpri = DISP_PRIO(tp);
 1211         if (ncpus == 1)
 1212                 cp = tp->t_cpu;
 1213         else if (!bound) {
 1214                 if (tpri >= kpqpri) {
 1215                         setkpdq(tp, SETKP_BACK);
 1216                         return;
 1217                 }
 1218 
 1219                 /*
 1220                  * We'll generally let this thread continue to run where
 1221                  * it last ran, but will consider migration if:
 1222                  * - The thread probably doesn't have much cache warmth.
 1223                  * - The CPU where it last ran is the target of an offline
 1224                  *   request.
 1225                  * - The thread last ran outside its home lgroup.
 1226                  */
 1227                 if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
 1228                     (tp->t_cpu == cpu_inmotion)) {
 1229                         cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
 1230                 } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
 1231                         cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
 1232                             self ? tp->t_cpu : NULL);
 1233                 } else {
 1234                         cp = tp->t_cpu;
 1235                 }
 1236 
 1237                 if (tp->t_cpupart == cp->cpu_part) {
 1238                         int     qlen;
 1239 
 1240                         /*
 1241                          * Perform any CMT load balancing
 1242                          */
 1243                         cp = cmt_balance(tp, cp);
 1244 
 1245                         /*
 1246                          * Balance across the run queues
 1247                          */
 1248                         qlen = RUNQ_LEN(cp, tpri);
 1249                         if (tpri >= RUNQ_MATCH_PRI &&
 1250                             !(tp->t_schedflag & TS_RUNQMATCH))
 1251                                 qlen -= RUNQ_MAX_DIFF;
 1252                         if (qlen > 0) {
 1253                                 cpu_t *newcp;
 1254 
 1255                                 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
 1256                                         newcp = cp->cpu_next_part;
 1257                                 } else if ((newcp = cp->cpu_next_lpl) == cp) {
 1258                                         newcp = cp->cpu_next_part;
 1259                                 }
 1260 
 1261                                 if (RUNQ_LEN(newcp, tpri) < qlen) {
 1262                                         DTRACE_PROBE3(runq__balance,
 1263                                             kthread_t *, tp,
 1264                                             cpu_t *, cp, cpu_t *, newcp);
 1265                                         cp = newcp;
 1266                                 }
 1267                         }
 1268                 } else {
 1269                         /*
 1270                          * Migrate to a cpu in the new partition.
 1271                          */
 1272                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
 1273                             tp->t_lpl, tp->t_pri, NULL);
 1274                 }
 1275                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
 1276         } else {
 1277                 /*
 1278                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
 1279                  * a short time until weak binding that existed when the
 1280                  * strong binding was established has dropped) so we must
 1281                  * favour weak binding over strong.
 1282                  */
 1283                 cp = tp->t_weakbound_cpu ?
 1284                     tp->t_weakbound_cpu : tp->t_bound_cpu;
 1285         }
 1286         /*
 1287          * A thread that is ONPROC may be temporarily placed on the run queue
 1288          * but then chosen to run again by disp.  If the thread we're placing on
 1289          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
 1290          * replacement process is actually scheduled in swtch().  In this
 1291          * situation, curthread is the only thread that could be in the ONPROC
 1292          * state.
 1293          */
 1294         if ((!self) && (tp->t_waitrq == 0)) {
 1295                 hrtime_t curtime;
 1296 
 1297                 curtime = gethrtime_unscaled();
 1298                 (void) cpu_update_pct(tp, curtime);
 1299                 tp->t_waitrq = curtime;
 1300         } else {
 1301                 (void) cpu_update_pct(tp, gethrtime_unscaled());
 1302         }
 1303 
 1304         dp = cp->cpu_disp;
 1305         disp_lock_enter_high(&dp->disp_lock);
 1306 
 1307         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
 1308         TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
 1309             tpri, cp, tp);
 1310 
 1311 #ifndef NPROBE
 1312         /* Kernel probe */
 1313         if (tnf_tracing_active)
 1314                 tnf_thread_queue(tp, cp, tpri);
 1315 #endif /* NPROBE */
 1316 
 1317         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
 1318 
 1319         THREAD_RUN(tp, &dp->disp_lock);         /* set t_state to TS_RUN */
 1320         tp->t_disp_queue = dp;
 1321         tp->t_link = NULL;
 1322 
 1323         dq = &dp->disp_q[tpri];
 1324         dp->disp_nrunnable++;
 1325         if (!bound)
 1326                 dp->disp_steal = 0;
 1327         membar_enter();
 1328 
 1329         if (dq->dq_sruncnt++ != 0) {
 1330                 ASSERT(dq->dq_first != NULL);
 1331                 dq->dq_last->t_link = tp;
 1332                 dq->dq_last = tp;
 1333         } else {
 1334                 ASSERT(dq->dq_first == NULL);
 1335                 ASSERT(dq->dq_last == NULL);
 1336                 dq->dq_first = dq->dq_last = tp;
 1337                 BT_SET(dp->disp_qactmap, tpri);
 1338                 if (tpri > dp->disp_maxrunpri) {
 1339                         dp->disp_maxrunpri = tpri;
 1340                         membar_enter();
 1341                         cpu_resched(cp, tpri);
 1342                 }
 1343         }
 1344 
 1345         if (!bound && tpri > dp->disp_max_unbound_pri) {
 1346                 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
 1347                         /*
 1348                          * If there are no other unbound threads on the
 1349                          * run queue, don't allow other CPUs to steal
 1350                          * this thread while we are in the middle of a
 1351                          * context switch. We may just switch to it
 1352                          * again right away. CPU_DISP_DONTSTEAL is cleared
 1353                          * in swtch and swtch_to.
 1354                          */
 1355                         cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
 1356                 }
 1357                 dp->disp_max_unbound_pri = tpri;
 1358         }
 1359         (*disp_enq_thread)(cp, bound);
 1360 }
 1361 
 1362 /*
 1363  * Put the specified thread on the front of the dispatcher
 1364  * queue corresponding to its current priority.
 1365  *
 1366  * Called with the thread in transition, onproc or stopped state
 1367  * and locked (transition implies locked) and at high spl.
 1368  * Returns with the thread in TS_RUN state and still locked.
 1369  */
 1370 void
 1371 setfrontdq(kthread_t *tp)
 1372 {
 1373         disp_t          *dp;
 1374         dispq_t         *dq;
 1375         cpu_t           *cp;
 1376         pri_t           tpri;
 1377         int             bound;
 1378 
 1379         ASSERT(THREAD_LOCK_HELD(tp));
 1380         ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
 1381         ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
 1382 
 1383         /*
 1384          * If thread is "swapped" or on the swap queue don't
 1385          * queue it, but wake sched.
 1386          */
 1387         if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
 1388                 disp_swapped_setrun(tp);
 1389                 return;
 1390         }
 1391 
 1392         if (tp->t_bound_cpu || tp->t_weakbound_cpu)
 1393                 bound = 1;
 1394         else
 1395                 bound = 0;
 1396 
 1397         tpri = DISP_PRIO(tp);
 1398         if (ncpus == 1)
 1399                 cp = tp->t_cpu;
 1400         else if (!bound) {
 1401                 if (tpri >= kpqpri) {
 1402                         setkpdq(tp, SETKP_FRONT);
 1403                         return;
 1404                 }
 1405                 cp = tp->t_cpu;
 1406                 if (tp->t_cpupart == cp->cpu_part) {
 1407                         /*
 1408                          * We'll generally let this thread continue to run
 1409                          * where it last ran, but will consider migration if:
 1410                          * - The thread last ran outside its home lgroup.
 1411                          * - The CPU where it last ran is the target of an
 1412                          *   offline request (a thread_nomigrate() on the
 1413                          *   in-motion CPU relies on this when forcing a preempt).
 1414                          * - The thread isn't the highest priority thread where
 1415                          *   it last ran, and it is considered not likely to
 1416                          *   have significant cache warmth.
 1417                          */
 1418                         if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
 1419                             (cp == cpu_inmotion)) {
 1420                                 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
 1421                                     (tp == curthread) ? cp : NULL);
 1422                         } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
 1423                             (!THREAD_HAS_CACHE_WARMTH(tp))) {
 1424                                 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
 1425                                     NULL);
 1426                         }
 1427                 } else {
 1428                         /*
 1429                          * Migrate to a cpu in the new partition.
 1430                          */
 1431                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
 1432                             tp->t_lpl, tp->t_pri, NULL);
 1433                 }
 1434                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
 1435         } else {
 1436                 /*
 1437                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
 1438                  * a short time until weak binding that existed when the
 1439                  * strong binding was established has dropped) so we must
 1440                  * favour weak binding over strong.
 1441                  */
 1442                 cp = tp->t_weakbound_cpu ?
 1443                     tp->t_weakbound_cpu : tp->t_bound_cpu;
 1444         }
 1445 
 1446         /*
 1447          * A thread that is ONPROC may be temporarily placed on the run queue
 1448          * but then chosen to run again by disp.  If the thread we're placing on
 1449          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
 1450          * replacement process is actually scheduled in swtch().  In this
 1451          * situation, curthread is the only thread that could be in the ONPROC
 1452          * state.
 1453          */
 1454         if ((tp != curthread) && (tp->t_waitrq == 0)) {
 1455                 hrtime_t curtime;
 1456 
 1457                 curtime = gethrtime_unscaled();
 1458                 (void) cpu_update_pct(tp, curtime);
 1459                 tp->t_waitrq = curtime;
 1460         } else {
 1461                 (void) cpu_update_pct(tp, gethrtime_unscaled());
 1462         }
 1463 
 1464         dp = cp->cpu_disp;
 1465         disp_lock_enter_high(&dp->disp_lock);
 1466 
 1467         TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
 1468         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
 1469 
 1470 #ifndef NPROBE
 1471         /* Kernel probe */
 1472         if (tnf_tracing_active)
 1473                 tnf_thread_queue(tp, cp, tpri);
 1474 #endif /* NPROBE */
 1475 
 1476         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
 1477 
 1478         THREAD_RUN(tp, &dp->disp_lock);         /* set TS_RUN state and lock */
 1479         tp->t_disp_queue = dp;
 1480 
 1481         dq = &dp->disp_q[tpri];
 1482         dp->disp_nrunnable++;
 1483         if (!bound)
 1484                 dp->disp_steal = 0;
 1485         membar_enter();
 1486 
 1487         if (dq->dq_sruncnt++ != 0) {
 1488                 ASSERT(dq->dq_last != NULL);
 1489                 tp->t_link = dq->dq_first;
 1490                 dq->dq_first = tp;
 1491         } else {
 1492                 ASSERT(dq->dq_last == NULL);
 1493                 ASSERT(dq->dq_first == NULL);
 1494                 tp->t_link = NULL;
 1495                 dq->dq_first = dq->dq_last = tp;
 1496                 BT_SET(dp->disp_qactmap, tpri);
 1497                 if (tpri > dp->disp_maxrunpri) {
 1498                         dp->disp_maxrunpri = tpri;
 1499                         membar_enter();
 1500                         cpu_resched(cp, tpri);
 1501                 }
 1502         }
 1503 
 1504         if (!bound && tpri > dp->disp_max_unbound_pri) {
 1505                 if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
 1506                     cp == CPU) {
 1507                         /*
 1508                          * If there are no other unbound threads on the
 1509                          * run queue, don't allow other CPUs to steal
 1510                          * this thread while we are in the middle of a
 1511                          * context switch. We may just switch to it
 1512                          * again right away. CPU_DISP_DONTSTEAL is cleared
 1513                          * in swtch and swtch_to.
 1514                          */
 1515                         cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
 1516                 }
 1517                 dp->disp_max_unbound_pri = tpri;
 1518         }
 1519         (*disp_enq_thread)(cp, bound);
 1520 }
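
/*
 * Illustrative sketch, not part of the dispatcher proper: the placement
 * heuristic described in the comment inside setfrontdq() above.  The
 * predicate parameters are hypothetical stand-ins for the checks the real
 * code makes with LGRP_CONTAINS_CPU(), cpu_inmotion, disp_maxrunpri, and
 * THREAD_HAS_CACHE_WARMTH().
 */
#if 0   /* illustrative only; excluded from compilation */
#include <stdbool.h>

/* Decide whether a woken thread should stay on the CPU where it last ran. */
static bool
skf_keep_on_last_cpu(bool ran_in_home_lgroup, bool cpu_is_offline_target,
    bool highest_pri_there, bool has_cache_warmth)
{
        if (!ran_in_home_lgroup || cpu_is_offline_target)
                return (false);         /* look for a less busy CPU instead */
        if (!highest_pri_there && !has_cache_warmth)
                return (false);         /* little affinity benefit in staying */
        return (true);                  /* preserve cache and lgroup affinity */
}
#endif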
 1521 
 1522 /*
 1523  * Put a high-priority unbound thread on the kp queue
 1524  */
 1525 static void
 1526 setkpdq(kthread_t *tp, int borf)
 1527 {
 1528         dispq_t *dq;
 1529         disp_t  *dp;
 1530         cpu_t   *cp;
 1531         pri_t   tpri;
 1532 
 1533         tpri = DISP_PRIO(tp);
 1534 
 1535         dp = &tp->t_cpupart->cp_kp_queue;
 1536         disp_lock_enter_high(&dp->disp_lock);
 1537 
 1538         TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
 1539 
 1540         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
 1541         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
 1542         THREAD_RUN(tp, &dp->disp_lock);         /* set t_state to TS_RUN */
 1543         tp->t_disp_queue = dp;
 1544         dp->disp_nrunnable++;
 1545         dq = &dp->disp_q[tpri];
 1546 
 1547         if (dq->dq_sruncnt++ != 0) {
 1548                 if (borf == SETKP_BACK) {
 1549                         ASSERT(dq->dq_first != NULL);
 1550                         tp->t_link = NULL;
 1551                         dq->dq_last->t_link = tp;
 1552                         dq->dq_last = tp;
 1553                 } else {
 1554                         ASSERT(dq->dq_last != NULL);
 1555                         tp->t_link = dq->dq_first;
 1556                         dq->dq_first = tp;
 1557                 }
 1558         } else {
 1559                 if (borf == SETKP_BACK) {
 1560                         ASSERT(dq->dq_first == NULL);
 1561                         ASSERT(dq->dq_last == NULL);
 1562                         dq->dq_first = dq->dq_last = tp;
 1563                 } else {
 1564                         ASSERT(dq->dq_last == NULL);
 1565                         ASSERT(dq->dq_first == NULL);
 1566                         tp->t_link = NULL;
 1567                         dq->dq_first = dq->dq_last = tp;
 1568                 }
 1569                 BT_SET(dp->disp_qactmap, tpri);
 1570                 if (tpri > dp->disp_max_unbound_pri)
 1571                         dp->disp_max_unbound_pri = tpri;
 1572                 if (tpri > dp->disp_maxrunpri) {
 1573                         dp->disp_maxrunpri = tpri;
 1574                         membar_enter();
 1575                 }
 1576         }
 1577 
 1578         cp = tp->t_cpu;
 1579         if (tp->t_cpupart != cp->cpu_part) {
 1580                 /* migrate to a cpu in the new partition */
 1581                 cp = tp->t_cpupart->cp_cpulist;
 1582         }
 1583         cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
 1584         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
 1585         ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
 1586 
 1587 #ifndef NPROBE
 1588         /* Kernel probe */
 1589         if (tnf_tracing_active)
 1590                 tnf_thread_queue(tp, cp, tpri);
 1591 #endif /* NPROBE */
 1592 
 1593         if (cp->cpu_chosen_level < tpri)
 1594                 cp->cpu_chosen_level = tpri;
 1595         cpu_resched(cp, tpri);
 1596         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
 1597         (*disp_enq_thread)(cp, 0);
 1598 }
 1599 
 1600 /*
 1601  * Remove a thread from the dispatcher queue if it is on it.
 1602  * It is not an error if it is not found, but we return whether
 1603  * or not it was found in case the caller wants to check.
 1604  */
 1605 int
 1606 dispdeq(kthread_t *tp)
 1607 {
 1608         disp_t          *dp;
 1609         dispq_t         *dq;
 1610         kthread_t       *rp;
 1611         kthread_t       *trp;
 1612         kthread_t       **ptp;
 1613         int             tpri;
 1614 
 1615         ASSERT(THREAD_LOCK_HELD(tp));
 1616 
 1617         if (tp->t_state != TS_RUN)
 1618                 return (0);
 1619 
 1620         /*
 1621          * The thread is "swapped" or is on the swap queue and
 1622          * hence no longer on the run queue, so return true.
 1623          */
 1624         if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
 1625                 return (1);
 1626 
 1627         tpri = DISP_PRIO(tp);
 1628         dp = tp->t_disp_queue;
 1629         ASSERT(tpri < dp->disp_npri);
 1630         dq = &dp->disp_q[tpri];
 1631         ptp = &dq->dq_first;
 1632         rp = *ptp;
 1633         trp = NULL;
 1634 
 1635         ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
 1636 
 1637         /*
 1638          * Search for thread in queue.
 1639          * Double links would simplify this at the expense of disp/setrun.
 1640          */
 1641         while (rp != tp && rp != NULL) {
 1642                 trp = rp;
 1643                 ptp = &trp->t_link;
 1644                 rp = trp->t_link;
 1645         }
 1646 
 1647         if (rp == NULL) {
 1648                 panic("dispdeq: thread not on queue");
 1649         }
 1650 
 1651         DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
 1652 
 1653         /*
 1654          * Found it so remove it from queue.
 1655          */
 1656         if ((*ptp = rp->t_link) == NULL)
 1657                 dq->dq_last = trp;
 1658 
 1659         dp->disp_nrunnable--;
 1660         if (--dq->dq_sruncnt == 0) {
 1661                 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
 1662                 if (dp->disp_nrunnable == 0) {
 1663                         dp->disp_max_unbound_pri = -1;
 1664                         dp->disp_maxrunpri = -1;
 1665                 } else if (tpri == dp->disp_maxrunpri) {
 1666                         int ipri;
 1667 
 1668                         ipri = bt_gethighbit(dp->disp_qactmap,
 1669                             dp->disp_maxrunpri >> BT_ULSHIFT);
 1670                         if (ipri < dp->disp_max_unbound_pri)
 1671                                 dp->disp_max_unbound_pri = ipri;
 1672                         dp->disp_maxrunpri = ipri;
 1673                 }
 1674         }
 1675         tp->t_link = NULL;
 1676         THREAD_TRANSITION(tp);          /* put in intermediate state */
 1677         return (1);
 1678 }
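
/*
 * Illustrative sketch, not part of the dispatcher proper: the single-pass
 * removal technique dispdeq() uses above, restated with simplified,
 * hypothetical types.  The walk keeps a pointer to the link field that
 * currently points at the node being examined, so unlinking is one store,
 * and remembers the predecessor so the tail pointer can be repaired.
 */
#if 0   /* illustrative only; excluded from compilation */
#include <stddef.h>

typedef struct skd_thread {
        struct skd_thread *t_link;
} skd_thread_t;

typedef struct skd_dispq {
        skd_thread_t    *dq_first;
        skd_thread_t    *dq_last;
} skd_dispq_t;

/* Remove tp from dq; returns 1 if it was found, 0 otherwise. */
static int
skd_dequeue(skd_dispq_t *dq, skd_thread_t *tp)
{
        skd_thread_t **ptp = &dq->dq_first;     /* link that points at *ptp */
        skd_thread_t *prev = NULL;

        while (*ptp != NULL && *ptp != tp) {
                prev = *ptp;
                ptp = &prev->t_link;
        }
        if (*ptp == NULL)
                return (0);                     /* not on this queue */
        *ptp = tp->t_link;                      /* unlink with one store */
        if (dq->dq_last == tp)
                dq->dq_last = prev;             /* removed the tail */
        tp->t_link = NULL;
        return (1);
}
#endif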
 1679 
 1680 
 1681 /*
 1682  * dq_sruninc and dq_srundec are public functions for
 1683  * incrementing/decrementing the sruncnts when a thread on
 1684  * a dispatcher queue is made schedulable/unschedulable by
 1685  * resetting the TS_LOAD flag.
 1686  *
 1687  * The caller MUST have the thread lock and therefore the dispatcher
 1688  * queue lock so that the operation which changes
 1689  * the flag, the operation that checks the status of the thread to
 1690  * determine if it's on a disp queue AND the call to this function
 1691  * are one atomic operation with respect to interrupts.
 1692  */
 1693 
 1694 /*
 1695  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
 1696  */
 1697 void
 1698 dq_sruninc(kthread_t *t)
 1699 {
 1700         ASSERT(t->t_state == TS_RUN);
 1701         ASSERT(t->t_schedflag & TS_LOAD);
 1702 
 1703         THREAD_TRANSITION(t);
 1704         setfrontdq(t);
 1705 }
 1706 
 1707 /*
 1708  * See comment on calling conventions above.
 1709  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
 1710  */
 1711 void
 1712 dq_srundec(kthread_t *t)
 1713 {
 1714         ASSERT(t->t_schedflag & TS_LOAD);
 1715 
 1716         (void) dispdeq(t);
 1717         disp_swapped_enq(t);
 1718 }
 1719 
 1720 /*
 1721  * Change the dispatcher lock of thread to the "swapped_lock"
 1722  * and return with thread lock still held.
 1723  *
 1724  * Called with thread_lock held, in transition state, and at high spl.
 1725  */
 1726 void
 1727 disp_swapped_enq(kthread_t *tp)
 1728 {
 1729         ASSERT(THREAD_LOCK_HELD(tp));
 1730         ASSERT(tp->t_schedflag & TS_LOAD);
 1731 
 1732         switch (tp->t_state) {
 1733         case TS_RUN:
 1734                 disp_lock_enter_high(&swapped_lock);
 1735                 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
 1736                 break;
 1737         case TS_ONPROC:
 1738                 disp_lock_enter_high(&swapped_lock);
 1739                 THREAD_TRANSITION(tp);
 1740                 wake_sched_sec = 1;             /* tell clock to wake sched */
 1741                 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
 1742                 break;
 1743         default:
 1744                 panic("disp_swapped: tp: %p bad t_state", (void *)tp);
 1745         }
 1746 }
 1747 
 1748 /*
 1749  * This routine is called by setbackdq/setfrontdq if the thread is
 1750  * not loaded, or is loaded but on the swap queue.
 1751  *
 1752  * Thread state TS_SLEEP implies that a swapped thread
 1753  * has been woken up and needs to be swapped in by the swapper.
 1754  *
 1755  * Thread state TS_RUN implies that the priority of a swapped
 1756  * thread is being increased by its scheduling class (e.g. ts_update).
 1757  */
 1758 static void
 1759 disp_swapped_setrun(kthread_t *tp)
 1760 {
 1761         ASSERT(THREAD_LOCK_HELD(tp));
 1762         ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
 1763 
 1764         switch (tp->t_state) {
 1765         case TS_SLEEP:
 1766                 disp_lock_enter_high(&swapped_lock);
 1767                 /*
 1768                  * Wakeup sched immediately (i.e., next tick) if the
 1769                  * thread priority is above maxclsyspri.
 1770                  * Wake up sched immediately (i.e., next tick) if the
 1771                 if (DISP_PRIO(tp) > maxclsyspri)
 1772                         wake_sched = 1;
 1773                 else
 1774                         wake_sched_sec = 1;
 1775                 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
 1776                 break;
 1777         case TS_RUN:                            /* called from ts_update */
 1778                 break;
 1779         default:
 1780                 panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
 1781         }
 1782 }
 1783 
 1784 /*
 1785  *      Make a thread give up its processor.  Find the processor on
 1786  *      which this thread is executing, and have that processor
 1787  *      preempt.
 1788  *
 1789  *      We allow System Duty Cycle (SDC) threads to be preempted even if
 1790  *      they are running at kernel priorities.  To implement this, we always
 1791  *      set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
 1792  *      calls cpu_surrender() very often, we only preempt if there is anyone
 1793  *      competing with us.
 1794  */
 1795 void
 1796 cpu_surrender(kthread_t *tp)
 1797 {
 1798         cpu_t   *cpup;
 1799         int     max_pri;
 1800         int     max_run_pri;
 1801         klwp_t  *lwp;
 1802 
 1803         ASSERT(THREAD_LOCK_HELD(tp));
 1804 
 1805         if (tp->t_state != TS_ONPROC)
 1806                 return;
 1807         cpup = tp->t_disp_queue->disp_cpu;      /* CPU thread dispatched to */
 1808         max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
 1809         max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
 1810         if (max_pri < max_run_pri)
 1811                 max_pri = max_run_pri;
 1812 
 1813         if (tp->t_cid == sysdccid) {
 1814                 uint_t t_pri = DISP_PRIO(tp);
 1815                 if (t_pri > max_pri)
 1816                         return;         /* we are not competing w/ anyone */
 1817                 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
 1818         } else {
 1819                 cpup->cpu_runrun = 1;
 1820                 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
 1821                         cpup->cpu_kprunrun = 1;
 1822                 }
 1823         }
 1824 
 1825         /*
 1826          * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
 1827          */
 1828         membar_enter();
 1829 
 1830         DTRACE_SCHED1(surrender, kthread_t *, tp);
 1831 
 1832         /*
 1833          * Make the target thread take an excursion through trap()
 1834          * to do preempt() (unless we're already in trap or post_syscall,
 1835          * calling cpu_surrender via CL_TRAPRET).
 1836          */
 1837         if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
 1838             lwp->lwp_state != LWP_USER) {
 1839                 aston(tp);
 1840                 if (cpup != CPU)
 1841                         poke_cpu(cpup->cpu_id);
 1842         }
 1843         TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
 1844             "cpu_surrender:tid %p cpu %p", tp, cpup);
 1845 }
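
/*
 * Illustrative sketch, not part of the dispatcher proper: the general
 * "request a remote preemption" pattern that cpu_surrender() follows above.
 * Publish the preemption flags, force them globally visible with a memory
 * barrier, then post an AST so the victim revisits trap(), and interrupt the
 * victim's CPU if it is not the current one.  All helper names below are
 * hypothetical.
 */
#if 0   /* illustrative only; excluded from compilation */
typedef struct skp_cpu {
        volatile int    runrun;         /* user-level preemption requested */
        volatile int    kprunrun;       /* kernel-priority preemption requested */
        int             id;
} skp_cpu_t;

extern void skp_membar_enter(void);     /* hypothetical store barrier */
extern void skp_post_ast(void *thread); /* hypothetical: re-enter trap() soon */
extern void skp_poke_cpu(int cpuid);    /* hypothetical cross-CPU interrupt */

static void
skp_request_preempt(skp_cpu_t *cp, void *victim, int kernel_pri, int remote)
{
        cp->runrun = 1;
        if (kernel_pri)
                cp->kprunrun = 1;
        skp_membar_enter();             /* flags must be visible before the poke */
        skp_post_ast(victim);           /* victim notices the flags in trap() */
        if (remote)
                skp_poke_cpu(cp->id);   /* make the remote CPU look right away */
}
#endif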
 1846 
 1847 /*
 1848  * Commit to and ratify a scheduling decision
 1849  */
 1850 /*ARGSUSED*/
 1851 static kthread_t *
 1852 disp_ratify(kthread_t *tp, disp_t *kpq)
 1853 {
 1854         pri_t   tpri, maxpri;
 1855         pri_t   maxkpri;
 1856         cpu_t   *cpup;
 1857 
 1858         ASSERT(tp != NULL);
 1859         /*
 1860          * Commit to, then ratify scheduling decision
 1861          */
 1862         cpup = CPU;
 1863         if (cpup->cpu_runrun != 0)
 1864                 cpup->cpu_runrun = 0;
 1865         if (cpup->cpu_kprunrun != 0)
 1866                 cpup->cpu_kprunrun = 0;
 1867         if (cpup->cpu_chosen_level != -1)
 1868                 cpup->cpu_chosen_level = -1;
 1869         membar_enter();
 1870         tpri = DISP_PRIO(tp);
 1871         maxpri = cpup->cpu_disp->disp_maxrunpri;
 1872         maxkpri = kpq->disp_maxrunpri;
 1873         if (maxpri < maxkpri)
 1874                 maxpri = maxkpri;
 1875         if (tpri < maxpri) {
 1876                 /*
 1877                  * should have done better
 1878                  * put this one back and indicate to try again
 1879                  */
 1880                 cpup->cpu_dispthread = curthread;       /* fixup dispthread */
 1881                 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
 1882                 thread_lock_high(tp);
 1883                 THREAD_TRANSITION(tp);
 1884                 setfrontdq(tp);
 1885                 thread_unlock_nopreempt(tp);
 1886 
 1887                 tp = NULL;
 1888         }
 1889         return (tp);
 1890 }
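
/*
 * Illustrative sketch, not part of the dispatcher proper: the commit-then-
 * ratify pattern used by disp_ratify() above.  The CPU first clears its
 * resched hints (the commit), then re-reads the queue maxima and gives the
 * thread back if a better candidate appeared in the meantime.  The helper
 * below is a simplified, hypothetical restatement of the final comparison.
 */
#if 0   /* illustrative only; excluded from compilation */
/* Returns the chosen priority, or -1 if the decision must be retried. */
static int
skr_ratify(int chosen_pri, int local_maxrunpri, int kp_maxrunpri)
{
        int maxpri = (local_maxrunpri > kp_maxrunpri) ?
            local_maxrunpri : kp_maxrunpri;

        /*
         * The real code clears cpu_runrun/cpu_kprunrun/cpu_chosen_level and
         * issues a memory barrier before making this comparison.
         */
        if (chosen_pri < maxpri)
                return (-1);            /* better work arrived: requeue, retry */
        return (chosen_pri);            /* the decision stands */
}
#endif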
 1891 
 1892 /*
 1893  * See if there is any work on the dispatcher queue for other CPUs.
 1894  * If there is, dequeue the best thread and return.
 1895  */
 1896 static kthread_t *
 1897 disp_getwork(cpu_t *cp)
 1898 {
 1899         cpu_t           *ocp;           /* other CPU */
 1900         cpu_t           *ocp_start;
 1901         cpu_t           *tcp;           /* target local CPU */
 1902         kthread_t       *tp;
 1903         kthread_t       *retval = NULL;
 1904         pri_t           maxpri;
 1905         disp_t          *kpq;           /* kp queue for this partition */
 1906         lpl_t           *lpl, *lpl_leaf;
 1907         int             leafidx, startidx;
 1908         hrtime_t        stealtime;
 1909         lgrp_id_t       local_id;
 1910 
 1911         maxpri = -1;
 1912         tcp = NULL;
 1913 
 1914         kpq = &cp->cpu_part->cp_kp_queue;
 1915         while (kpq->disp_maxrunpri >= 0) {
 1916                 /*
 1917                  * Try to take a thread from the kp_queue.
 1918                  */
 1919                 tp = (disp_getbest(kpq));
 1920                 if (tp)
 1921                         return (disp_ratify(tp, kpq));
 1922         }
 1923 
 1924         kpreempt_disable();             /* protect the cpu_active list */
 1925 
 1926         /*
 1927          * Try to find something to do on another CPU's run queue.
 1928          * Loop through all other CPUs looking for the one with the highest
 1929          * priority unbound thread.
 1930          *
 1931          * On NUMA machines, the partition's CPUs are consulted in order of
 1932          * distance from the current CPU. This way, the first available
 1933          * work found is also the closest, and will suffer the least
 1934          * from being migrated.
 1935          */
 1936         lpl = lpl_leaf = cp->cpu_lpl;
 1937         local_id = lpl_leaf->lpl_lgrpid;
 1938         leafidx = startidx = 0;
 1939 
 1940         /*
 1941          * This loop traverses the lpl hierarchy. Higher level lpls represent
 1942          * broader levels of locality
 1943          */
 1944         do {
 1945                 /* This loop iterates over the lpl's leaves */
 1946                 do {
 1947                         if (lpl_leaf != cp->cpu_lpl)
 1948                                 ocp = lpl_leaf->lpl_cpus;
 1949                         else
 1950                                 ocp = cp->cpu_next_lpl;
 1951 
 1952                         /* This loop iterates over the CPUs in the leaf */
 1953                         ocp_start = ocp;
 1954                         do {
 1955                                 pri_t pri;
 1956 
 1957                                 ASSERT(CPU_ACTIVE(ocp));
 1958 
 1959                                 /*
 1960                                  * End our stroll around this lpl if:
 1961                                  *
 1962                                  * - Something became runnable on the local
 1963                                  *   queue...which also ends our stroll around
 1964                                  *   the partition.
 1965                                  *
 1966                                  * - We happen across another idle CPU.
 1967                                  *   Since it is patrolling the next portion
 1968                                  *   of the lpl's list (assuming it's not
 1969                                  *   halted, or busy servicing an interrupt),
 1970                                  *   move to the next higher level of locality.
 1971                                  */
 1972                                 if (cp->cpu_disp->disp_nrunnable != 0) {
 1973                                         kpreempt_enable();
 1974                                         return (NULL);
 1975                                 }
 1976                                 if (ocp->cpu_dispatch_pri == -1) {
 1977                                         if (ocp->cpu_disp_flags &
 1978                                             CPU_DISP_HALTED ||
 1979                                             ocp->cpu_intr_actv != 0)
 1980                                                 continue;
 1981                                         else
 1982                                                 goto next_level;
 1983                                 }
 1984 
 1985                                 /*
 1986                                  * If there's only one thread and the CPU
 1987                                  * is in the middle of a context switch,
 1988                                  * or it's currently running the idle thread,
 1989                                  * don't steal it.
 1990                                  */
 1991                                 if ((ocp->cpu_disp_flags &
 1992                                     CPU_DISP_DONTSTEAL) &&
 1993                                     ocp->cpu_disp->disp_nrunnable == 1)
 1994                                         continue;
 1995 
 1996                                 pri = ocp->cpu_disp->disp_max_unbound_pri;
 1997                                 if (pri > maxpri) {
 1998                                         /*
 1999                                          * Don't steal threads that we attempted
 2000                                          * to steal recently until they're ready
 2001                                          * to be stolen again.
 2002                                          */
 2003                                         stealtime = ocp->cpu_disp->disp_steal;
 2004                                         if (stealtime == 0 ||
 2005                                             stealtime - gethrtime() <= 0) {
 2006                                                 maxpri = pri;
 2007                                                 tcp = ocp;
 2008                                         } else {
 2009                                                 /*
 2010                                                  * Don't update tcp, just set
 2011                                                  * the retval to T_DONTSTEAL, so
 2012                                                  * that if no acceptable CPUs
 2013                                                  * are found the return value
 2014                                                  * will be T_DONTSTEAL rather
 2015                                                  * than NULL.
 2016                                                  */
 2017                                                 retval = T_DONTSTEAL;
 2018                                         }
 2019                                 }
 2020                         } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
 2021 
 2022                         /*
 2023                          * Iterate to the next leaf lpl in the resource set
 2024                          * at this level of locality. If we hit the end of
 2025                          * the set, wrap back around to the beginning.
 2026                          *
 2027                          * Note: This iteration is NULL-terminated for a reason;
 2028                          * see lpl_topo_bootstrap() in lgrp.c for details.
 2029                          */
 2030                         if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
 2031                                 leafidx = 0;
 2032                                 lpl_leaf = lpl->lpl_rset[leafidx];
 2033                         }
 2034                 } while (leafidx != startidx);
 2035 
 2036 next_level:
 2037                 /*
 2038                  * Expand the search to include farther away CPUs (next
 2039                  * locality level). The closer CPUs that have already been
 2040                  * checked will be checked again. In doing so, idle CPUs
 2041                  * will tend to be more aggressive about stealing from CPUs
 2042                  * that are closer (since the closer CPUs will be considered
 2043                  * more often).
 2044                  * Begin at this level with the CPU's local leaf lpl.
 2045                  */
 2046                 if ((lpl = lpl->lpl_parent) != NULL) {
 2047                         leafidx = startidx = lpl->lpl_id2rset[local_id];
 2048                         lpl_leaf = lpl->lpl_rset[leafidx];
 2049                 }
 2050         } while (!tcp && lpl);
 2051 
 2052         kpreempt_enable();
 2053 
 2054         /*
 2055          * If another queue looks good, and there is still nothing on
 2056          * the local queue, try to transfer one or more threads
 2057          * from it to our queue.
 2058          */
 2059         if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
 2060                 tp = disp_getbest(tcp->cpu_disp);
 2061                 if (tp == NULL || tp == T_DONTSTEAL)
 2062                         return (tp);
 2063                 return (disp_ratify(tp, kpq));
 2064         }
 2065         return (retval);
 2066 }
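
/*
 * Illustrative sketch, not part of the dispatcher proper: the locality-
 * ordered search that disp_getwork() performs above.  At each level of the
 * lpl hierarchy, scan the CPUs of every leaf for the highest unbound
 * priority, and only widen the search to the parent level when nothing was
 * found.  The data structures are simplified and hypothetical; the real code
 * also honors CPU_DISP_DONTSTEAL, the nosteal window, and rechecks the local
 * queue as it goes.
 */
#if 0   /* illustrative only; excluded from compilation */
#include <stddef.h>

typedef struct skw_cpu {
        struct skw_cpu  *next;          /* circular list within one leaf */
        int             max_unbound_pri;        /* -1 means nothing to steal */
} skw_cpu_t;

typedef struct skw_lpl {
        struct skw_lpl  *parent;        /* broader level of locality */
        skw_cpu_t       **leaf_cpus;    /* one CPU ring per leaf, NULL terminated */
} skw_lpl_t;

/* Return the best CPU to steal from, preferring nearby work. */
static skw_cpu_t *
skw_find_victim(skw_lpl_t *lpl)
{
        for (; lpl != NULL; lpl = lpl->parent) {
                skw_cpu_t *best = NULL;
                int bestpri = -1;
                int i;

                for (i = 0; lpl->leaf_cpus[i] != NULL; i++) {
                        skw_cpu_t *start = lpl->leaf_cpus[i];
                        skw_cpu_t *cp = start;

                        do {
                                if (cp->max_unbound_pri > bestpri) {
                                        bestpri = cp->max_unbound_pri;
                                        best = cp;
                                }
                        } while ((cp = cp->next) != start);
                }
                if (best != NULL)
                        return (best);  /* the closest work wins */
        }
        return (NULL);                  /* nothing runnable anywhere */
}
#endif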
 2067 
 2068 
 2069 /*
 2070  * disp_fix_unbound_pri()
 2071  *      Determines the maximum priority of unbound threads on the queue.
 2072  *      The priority is kept for the queue, but is only increased, never
 2073  *      reduced unless some CPU is looking for something on that queue.
 2074  *
 2075  *      The priority argument is the known upper limit.
 2076  *
 2077  *      Perhaps this should be kept accurately, but that probably means
 2078  *      separate bitmaps for bound and unbound threads.  Since only idled
 2079  *      CPUs will have to do this recalculation, it seems better this way.
 2080  */
 2081 static void
 2082 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
 2083 {
 2084         kthread_t       *tp;
 2085         dispq_t         *dq;
 2086         ulong_t         *dqactmap = dp->disp_qactmap;
 2087         ulong_t         mapword;
 2088         int             wx;
 2089 
 2090         ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
 2091 
 2092         ASSERT(pri >= 0);                       /* checked by caller */
 2093 
 2094         /*
 2095          * Start the search at the next lowest priority below the supplied
 2096          * priority.  This depends on the bitmap implementation.
 2097          */
 2098         do {
 2099                 wx = pri >> BT_ULSHIFT;         /* index of word in map */
 2100 
 2101                 /*
 2102                  * Form mask for all lower priorities in the word.
 2103                  */
 2104                 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
 2105 
 2106                 /*
 2107                  * Get next lower active priority.
 2108                  */
 2109                 if (mapword != 0) {
 2110                         pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
 2111                 } else if (wx > 0) {
 2112                         pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
 2113                         if (pri < 0)
 2114                                 break;
 2115                 } else {
 2116                         pri = -1;
 2117                         break;
 2118                 }
 2119 
 2120                 /*
 2121                  * Search the queue for unbound, runnable threads.
 2122                  */
 2123                 dq = &dp->disp_q[pri];
 2124                 tp = dq->dq_first;
 2125 
 2126                 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
 2127                         tp = tp->t_link;
 2128                 }
 2129 
 2130                 /*
 2131                  * If a thread was found, set the priority and return.
 2132                  */
 2133         } while (tp == NULL);
 2134 
 2135         /*
 2136          * pri holds the maximum unbound thread priority or -1.
 2137          */
 2138         if (dp->disp_max_unbound_pri != pri)
 2139                 dp->disp_max_unbound_pri = pri;
 2140 }
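
/*
 * Illustrative sketch, not part of the dispatcher proper: the bitmap scan
 * that disp_fix_unbound_pri() builds on above.  Given an array of words with
 * one bit per active priority, find the highest set bit strictly below a
 * starting priority.  The constant and helpers are simplified, hypothetical
 * stand-ins for BT_ULSHIFT, BT_BIW(), and highbit().
 */
#if 0   /* illustrative only; excluded from compilation */
#include <limits.h>

#define SKM_WORDBITS    (sizeof (unsigned long) * CHAR_BIT)

/* 0-based index of the most significant set bit, or -1 if w == 0. */
static int
skm_highbit(unsigned long w)
{
        int b = -1;

        while (w != 0) {
                b++;
                w >>= 1;
        }
        return (b);
}

/* Highest active priority strictly below pri, or -1 if there is none. */
static int
skm_next_lower_active(const unsigned long *map, int pri)
{
        int wx = pri / SKM_WORDBITS;
        unsigned long word = map[wx] &
            ((1UL << (pri % SKM_WORDBITS)) - 1);        /* lower bits only */

        for (;;) {
                if (word != 0)
                        return ((int)(wx * SKM_WORDBITS) + skm_highbit(word));
                if (wx == 0)
                        return (-1);
                word = map[--wx];
        }
}
#endif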
 2141 
 2142 /*
 2143  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 2144  *      check if the CPU to which it was previously bound should have
 2145  *      its disp_max_unbound_pri increased.
 2146  */
 2147 void
 2148 disp_adjust_unbound_pri(kthread_t *tp)
 2149 {
 2150         disp_t *dp;
 2151         pri_t tpri;
 2152 
 2153         ASSERT(THREAD_LOCK_HELD(tp));
 2154 
 2155         /*
 2156          * Don't do anything if the thread is not bound, or
 2157          * currently not runnable or swapped out.
 2158          */
 2159         if (tp->t_bound_cpu == NULL ||
 2160             tp->t_state != TS_RUN ||
 2161             tp->t_schedflag & TS_ON_SWAPQ)
 2162                 return;
 2163 
 2164         tpri = DISP_PRIO(tp);
 2165         dp = tp->t_bound_cpu->cpu_disp;
 2166         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
 2167         if (tpri > dp->disp_max_unbound_pri)
 2168                 dp->disp_max_unbound_pri = tpri;
 2169 }
 2170 
 2171 /*
 2172  * disp_getbest()
 2173  *   De-queue the highest priority unbound runnable thread.
 2174  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
 2175  *   Returns NULL if nothing found.
 2176  *   Returns T_DONTSTEAL if the thread was not stealable,
 2177  *   so that the caller will try again later.
 2178  *
 2179  *   Passed a pointer to a dispatch queue that is not associated with
 2180  *   this CPU.
 2181  */
 2182 static kthread_t *
 2183 disp_getbest(disp_t *dp)
 2184 {
 2185         kthread_t       *tp;
 2186         dispq_t         *dq;
 2187         pri_t           pri;
 2188         cpu_t           *cp, *tcp;
 2189         boolean_t       allbound;
 2190 
 2191         disp_lock_enter(&dp->disp_lock);
 2192 
 2193         /*
 2194          * If there is nothing to run, or the CPU is in the middle of a
 2195          * context switch of the only thread, return NULL.
 2196          */
 2197         tcp = dp->disp_cpu;
 2198         cp = CPU;
 2199         pri = dp->disp_max_unbound_pri;
 2200         if (pri == -1 ||
 2201             (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
 2202             tcp->cpu_disp->disp_nrunnable == 1)) {
 2203                 disp_lock_exit_nopreempt(&dp->disp_lock);
 2204                 return (NULL);
 2205         }
 2206 
 2207         dq = &dp->disp_q[pri];
 2208 
 2209 
 2210         /*
 2211          * Assume that all threads are bound on this queue, and change it
 2212          * later when we find out that it is not the case.
 2213          */
 2214         allbound = B_TRUE;
 2215         for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
 2216                 hrtime_t now, nosteal, rqtime;
 2217 
 2218                 /*
 2219                  * Skip over bound threads which could be here even
 2220                  * though disp_max_unbound_pri indicated this level.
 2221                  */
 2222                 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
 2223                         continue;
 2224 
 2225                 /*
 2226                  * We've got some unbound threads on this queue, so turn
 2227                  * the allbound flag off now.
 2228                  */
 2229                 allbound = B_FALSE;
 2230 
 2231                 /*
 2232                  * The thread is a candidate for stealing from its run queue. We
 2233                  * don't want to steal threads that became runnable just a
 2234                  * moment ago. This improves CPU affinity for threads that get
 2235                  * preempted for short periods of time and go back on the run
 2236                  * queue.
 2237                  *
 2238                  * We want to let it stay on its run queue if it was only placed
 2239                  * there recently and it was running on the same CPU before that
 2240                  * to preserve its cache investment. For the thread to remain on
 2241                  * its run queue, ALL of the following conditions must be
 2242                  * satisfied:
 2243                  *
 2244                  * - the disp queue should not be the kernel preemption queue
 2245                  * - delayed idle stealing should not be disabled
 2246                  * - nosteal_nsec should be non-zero
 2247                  * - it should run with user priority
 2248                  * - it should be on the run queue of the CPU where it was
 2249                  *   running before being placed on the run queue
 2250                  * - it should be the only thread on the run queue (to prevent
 2251                  *   extra scheduling latency for other threads)
 2252                  * - it should sit on the run queue for less than per-chip
 2253                  *   nosteal interval or global nosteal interval
 2254                  * - in case of CPUs with shared cache it should sit in a run
 2255                  *   queue of a CPU from a different chip
 2256                  *
 2257                  * The checks are arranged so that the ones that are faster are
 2258                  * placed earlier.
 2259                  */
 2260                 if (tcp == NULL ||
 2261                     pri >= minclsyspri ||
 2262                     tp->t_cpu != tcp)
 2263                         break;
 2264 
 2265                 /*
 2266                  * Steal immediately if, due to the CMT processor architecture,
 2267                  * migration between cp and tcp would incur no performance
 2268                  * penalty.
 2269                  */
 2270                 if (pg_cmt_can_migrate(cp, tcp))
 2271                         break;
 2272 
 2273                 nosteal = nosteal_nsec;
 2274                 if (nosteal == 0)
 2275                         break;
 2276 
 2277                 /*
 2278                  * Calculate time spent sitting on run queue
 2279                  */
 2280                 now = gethrtime_unscaled();
 2281                 rqtime = now - tp->t_waitrq;
 2282                 scalehrtime(&rqtime);
 2283 
 2284                 /*
 2285                  * Steal immediately if the time spent on this run queue is more
 2286                  * than allowed nosteal delay.
 2287                  *
 2288                  * Negative rqtime check is needed here to avoid infinite
 2289                  * stealing delays caused by unlikely but not impossible
 2290                  * drifts between CPU times on different CPUs.
 2291                  */
 2292                 if (rqtime > nosteal || rqtime < 0)
 2293                         break;
 2294 
 2295                 DTRACE_PROBE4(nosteal, kthread_t *, tp,
 2296                     cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
 2297                 scalehrtime(&now);
 2298                 /*
 2299                  * Calculate when this thread becomes stealable
 2300                  */
 2301                 now += (nosteal - rqtime);
 2302 
 2303                 /*
 2304                  * Calculate time when some thread becomes stealable
 2305                  */
 2306                 if (now < dp->disp_steal)
 2307                         dp->disp_steal = now;
 2308         }
 2309 
 2310         /*
 2311          * If there were no unbound threads on this queue, find the queue
 2312          * where they are and then return later. The value of
 2313          * disp_max_unbound_pri is not always accurate because it isn't
 2314          * reduced until another idle CPU looks for work.
 2315          */
 2316         if (allbound)
 2317                 disp_fix_unbound_pri(dp, pri);
 2318 
 2319         /*
 2320          * If we reached the end of the queue and found no unbound threads
 2321          * then return NULL so that other CPUs will be considered.  If there
 2322          * are unbound threads but they cannot yet be stolen, then
 2323          * return T_DONTSTEAL and try again later.
 2324          */
 2325         if (tp == NULL) {
 2326                 disp_lock_exit_nopreempt(&dp->disp_lock);
 2327                 return (allbound ? NULL : T_DONTSTEAL);
 2328         }
 2329 
 2330         /*
 2331          * Found a runnable, unbound thread, so remove it from queue.
 2332          * dispdeq() requires that we have the thread locked, and we do,
 2333          * by virtue of holding the dispatch queue lock.  dispdeq() will
 2334          * put the thread in transition state, thereby dropping the dispq
 2335          * lock.
 2336          */
 2337 
 2338 #ifdef DEBUG
 2339         {
 2340                 int     thread_was_on_queue;
 2341 
 2342                 thread_was_on_queue = dispdeq(tp);      /* drops disp_lock */
 2343                 ASSERT(thread_was_on_queue);
 2344         }
 2345 
 2346 #else /* DEBUG */
 2347         (void) dispdeq(tp);                     /* drops disp_lock */
 2348 #endif /* DEBUG */
 2349 
 2350         /*
 2351          * Reset the disp_queue steal time - we do not know what the smallest
 2352          * value across the queue is.
 2353          */
 2354         dp->disp_steal = 0;
 2355 
 2356         tp->t_schedflag |= TS_DONT_SWAP;
 2357 
 2358         /*
 2359          * Setup thread to run on the current CPU.
 2360          */
 2361         tp->t_disp_queue = cp->cpu_disp;
 2362 
 2363         cp->cpu_dispthread = tp;                /* protected by spl only */
 2364         cp->cpu_dispatch_pri = pri;
 2365 
 2366         /*
 2367          * There can be a memory synchronization race between disp_getbest()
 2368          * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
 2369          * to preempt the current thread to run the enqueued thread while
 2370          * disp_getbest() and disp_ratify() are changing the current thread
 2371          * to the stolen thread. This may lead to a situation where
 2372          * cpu_resched() tries to preempt the wrong thread and the
 2373          * stolen thread continues to run on the CPU which has been tagged
 2374          * for preemption.
 2375          * Later the clock thread gets enqueued but doesn't get to run on the
 2376          * CPU causing the system to hang.
 2377          *
 2378          * To avoid this, grabbing and dropping the disp_lock (which does
 2379          * a memory barrier) is needed to synchronize the execution of
 2380          * cpu_resched() with disp_getbest() and disp_ratify() and
 2381          * synchronize the memory read and written by cpu_resched(),
 2382          * disp_getbest(), and disp_ratify() with each other.
 2383          *  (see CR#6482861 for more details).
 2384          */
 2385         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
 2386         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
 2387 
 2388         ASSERT(pri == DISP_PRIO(tp));
 2389 
 2390         DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
 2391 
 2392         thread_onproc(tp, cp);                  /* set t_state to TS_ONPROC */
 2393 
 2394         /*
 2395          * Return with spl high so that swtch() won't need to raise it.
 2396          * The disp_lock was dropped by dispdeq().
 2397          */
 2398 
 2399         return (tp);
 2400 }
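
/*
 * Illustrative sketch, not part of the dispatcher proper: the delayed-
 * stealing test summarized by the long comment inside disp_getbest() above.
 * The parameters are hypothetical stand-ins for values the real code reads
 * from the thread, the queues, pg_cmt_can_migrate(), and nosteal_nsec.
 */
#if 0   /* illustrative only; excluded from compilation */
#include <stdbool.h>
#include <stdint.h>

/*
 * Return true if the thread may be stolen now, false if it should be left
 * on its run queue a little longer to preserve its cache investment.
 */
static bool
sks_can_steal_now(bool is_kp_queue, bool user_priority, bool on_prev_cpu_queue,
    bool only_runnable_thread, bool cheap_cmt_migration,
    int64_t time_on_queue_ns, int64_t nosteal_ns)
{
        if (is_kp_queue || !user_priority || !on_prev_cpu_queue ||
            !only_runnable_thread || cheap_cmt_migration || nosteal_ns == 0)
                return (true);          /* no affinity worth protecting */

        /* The negative check guards against clock drift between CPUs. */
        return (time_on_queue_ns > nosteal_ns || time_on_queue_ns < 0);
}
#endif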
 2401 
 2402 /*
 2403  * disp_bound_common() - common routine for higher level functions
 2404  *      that check for bound threads under certain conditions.
 2405  *      If 'threadlistsafe' is set then there is no need to acquire
 2406  *      pidlock to stop the thread list from changing (e.g., if
 2407  *      disp_bound_* is called with cpus paused).
 2408  */
 2409 static int
 2410 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
 2411 {
 2412         int             found = 0;
 2413         kthread_t       *tp;
 2414 
 2415         ASSERT(flag);
 2416 
 2417         if (!threadlistsafe)
 2418                 mutex_enter(&pidlock);
 2419         tp = curthread;         /* faster than allthreads */
 2420         do {
 2421                 if (tp->t_state != TS_FREE) {
 2422                         /*
 2423                          * If an interrupt thread is busy, but the
 2424                          * caller doesn't care (i.e. BOUND_INTR is off),
 2425                          * then just ignore it and continue through.
 2426                          */
 2427                         if ((tp->t_flag & T_INTR_THREAD) &&
 2428                             !(flag & BOUND_INTR))
 2429                                 continue;
 2430 
 2431                         /*
 2432                          * Skip the idle thread for the CPU
 2433                          * we're about to set offline.
 2434                          */
 2435                         if (tp == cp->cpu_idle_thread)
 2436                                 continue;
 2437 
 2438                         /*
 2439                          * Skip the pause thread for the CPU
 2440                          * we're about to set offline.
 2441                          */
 2442                         if (tp == cp->cpu_pause_thread)
 2443                                 continue;
 2444 
 2445                         if ((flag & BOUND_CPU) &&
 2446                             (tp->t_bound_cpu == cp ||
 2447                             tp->t_bind_cpu == cp->cpu_id ||
 2448                             tp->t_weakbound_cpu == cp)) {
 2449                                 found = 1;
 2450                                 break;
 2451                         }
 2452 
 2453                         if ((flag & BOUND_PARTITION) &&
 2454                             (tp->t_cpupart == cp->cpu_part)) {
 2455                                 found = 1;
 2456                                 break;
 2457                         }
 2458                 }
 2459         } while ((tp = tp->t_next) != curthread && found == 0);
 2460         if (!threadlistsafe)
 2461                 mutex_exit(&pidlock);
 2462         return (found);
 2463 }
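
/*
 * Illustrative sketch, not part of the dispatcher proper: the circular
 * thread-list walk disp_bound_common() uses above.  The walk starts at the
 * current thread, follows t_next, and stops when it wraps back to where it
 * began.  Types and the predicate are simplified and hypothetical.
 */
#if 0   /* illustrative only; excluded from compilation */
#include <stdbool.h>

typedef struct ska_thread {
        struct ska_thread *t_next;      /* circular list of all threads */
} ska_thread_t;

/* Return true if any thread on the circular list satisfies match(). */
static bool
ska_any_thread(ska_thread_t *start, bool (*match)(const ska_thread_t *))
{
        ska_thread_t *tp = start;

        do {
                if (match(tp))
                        return (true);
        } while ((tp = tp->t_next) != start);
        return (false);
}
#endif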
 2464 
 2465 /*
 2466  * disp_bound_threads - return nonzero if threads are bound to the processor.
 2467  *      Called infrequently.  Keep this simple.
 2468  *      Includes threads that are asleep or stopped but not onproc.
 2469  */
 2470 int
 2471 disp_bound_threads(cpu_t *cp, int threadlistsafe)
 2472 {
 2473         return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
 2474 }
 2475 
 2476 /*
 2477  * disp_bound_anythreads - return nonzero if _any_ threads are bound
 2478  * to the given processor, including interrupt threads.
 2479  */
 2480 int
 2481 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
 2482 {
 2483         return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
 2484 }
 2485 
 2486 /*
 2487  * disp_bound_partition - return nonzero if threads are bound to the same
 2488  * partition as the processor.
 2489  *      Called infrequently.  Keep this simple.
 2490  *      Includes threads that are asleep or stopped but not onproc.
 2491  */
 2492 int
 2493 disp_bound_partition(cpu_t *cp, int threadlistsafe)
 2494 {
 2495         return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
 2496 }
 2497 
 2498 /*
 2499  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
 2500  * threads to other CPUs.
 2501  */
 2502 void
 2503 disp_cpu_inactive(cpu_t *cp)
 2504 {
 2505         kthread_t       *tp;
 2506         disp_t          *dp = cp->cpu_disp;
 2507         dispq_t         *dq;
 2508         pri_t           pri;
 2509         int             wasonq;
 2510 
 2511         disp_lock_enter(&dp->disp_lock);
 2512         while ((pri = dp->disp_max_unbound_pri) != -1) {
 2513                 dq = &dp->disp_q[pri];
 2514                 tp = dq->dq_first;
 2515 
 2516                 /*
 2517                  * Skip over bound threads.
 2518                  */
 2519                 while (tp != NULL && tp->t_bound_cpu != NULL) {
 2520                         tp = tp->t_link;
 2521                 }
 2522 
 2523                 if (tp == NULL) {
 2524                         /* disp_max_unbound_pri must be inaccurate, so fix it */
 2525                         disp_fix_unbound_pri(dp, pri);
 2526                         continue;
 2527                 }
 2528 
 2529                 wasonq = dispdeq(tp);           /* drops disp_lock */
 2530                 ASSERT(wasonq);
 2531                 ASSERT(tp->t_weakbound_cpu == NULL);
 2532 
 2533                 setbackdq(tp);
 2534                 /*
 2535                  * Called from cpu_offline:
 2536                  *
 2537                  * cp has already been removed from the list of active cpus
 2538                  * and tp->t_cpu has been changed so there is no risk of
 2539                  * tp ending up back on cp.
 2540                  *
 2541                  * Called from cpupart_move_cpu:
 2542                  *
 2543                  * The cpu has moved to a new cpupart.  Any threads that
 2544                  * were on its dispatch queues before the move remain
 2545                  * in the old partition and can't run in the new partition.
 2546                  */
 2547                 ASSERT(tp->t_cpu != cp);
 2548                 thread_unlock(tp);
 2549 
 2550                 disp_lock_enter(&dp->disp_lock);
 2551         }
 2552         disp_lock_exit(&dp->disp_lock);
 2553 }
 2554 
 2555 /*
 2556  * disp_lowpri_cpu - find CPU running the lowest priority thread.
 2557  *      The hint passed in is used as a starting point so we don't favor
 2558  *      CPU 0 or any other CPU.  The caller should pass in the most recently
 2559  *      used CPU for the thread.
 2560  *
 2561  *      The lgroup and priority are used to determine the best CPU to run on
 2562  *      in a NUMA machine.  The lgroup specifies which CPUs are closest while
 2563  *      the thread priority will indicate whether the thread will actually run
 2564  *      there.  To pick the best CPU, the CPUs inside and outside of the given
 2565  *      lgroup which are running the lowest priority threads are found.  The
 2566  *      remote CPU is chosen only if the thread will not run locally on a CPU
 2567  *      within the lgroup, but will run on the remote CPU. If the thread
 2568  *      cannot immediately run on any CPU, the best local CPU will be chosen.
 2569  *
 2570  *      The lpl specified also identifies the cpu partition from which
 2571  *      disp_lowpri_cpu should select a CPU.
 2572  *
 2573  *      curcpu is used to indicate that disp_lowpri_cpu is being called on
 2574  *      behalf of the current thread (curthread is looking for a new cpu).
 2575  *      In this case, cpu_dispatch_pri for this thread's cpu should be
 2576  *      ignored.
 2577  *
 2578  *      If a cpu is the target of an offline request then try to avoid it.
 2579  *
 2580  *      This function must be called at either high SPL, or with preemption
 2581  *      disabled, so that the "hint" CPU cannot be removed from the online
 2582  *      CPU list while we are traversing it.
 2583  */
 2584 cpu_t *
 2585 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
 2586 {
 2587         cpu_t   *bestcpu;
 2588         cpu_t   *besthomecpu;
 2589         cpu_t   *cp, *cpstart;
 2590 
 2591         pri_t   bestpri;
 2592         pri_t   cpupri;
 2593 
 2594         klgrpset_t      done;
 2595         klgrpset_t      cur_set;
 2596 
 2597         lpl_t           *lpl_iter, *lpl_leaf;
 2598         int             i;
 2599 
 2600         /*
 2601          * Scan for a CPU currently running the lowest priority thread.
 2602          * Cannot get cpu_lock here because it is adaptive.
 2603          * We do not require a lock on the CPU list.
 2604          */
 2605         ASSERT(hint != NULL);
 2606         ASSERT(lpl != NULL);
 2607         ASSERT(lpl->lpl_ncpu > 0);
 2608 
 2609         /*
 2610          * First examine local CPUs. Note that it's possible the hint CPU
 2611          * passed in is remote to the specified home lgroup. If our priority
 2612          * isn't high enough for us to run immediately at home,
 2613          * then examine CPUs remote to our home lgroup.
 2614          * We would like to give preference to CPUs closest to "home".
 2615          * If we can't find a CPU where we'll run at a given level
 2616          * of locality, we expand our search to include the next level.
 2617          */
 2618         bestcpu = besthomecpu = NULL;
 2619         klgrpset_clear(done);
 2620         /* start with lpl we were passed */
 2621 
 2622         lpl_iter = lpl;
 2623 
 2624         do {
 2625 
 2626                 bestpri = SHRT_MAX;
 2627                 klgrpset_clear(cur_set);
 2628 
 2629                 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
 2630                         lpl_leaf = lpl_iter->lpl_rset[i];
 2631                         if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
 2632                                 continue;
 2633 
 2634                         klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
 2635 
 2636                         if (hint->cpu_lpl == lpl_leaf)
 2637                                 cp = cpstart = hint;
 2638                         else
 2639                                 cp = cpstart = lpl_leaf->lpl_cpus;
 2640 
 2641                         do {
 2642                                 if (cp == curcpu)
 2643                                         cpupri = -1;
 2644                                 else if (cp == cpu_inmotion)
 2645                                         cpupri = SHRT_MAX;
 2646                                 else
 2647                                         cpupri = cp->cpu_dispatch_pri;
 2648                                 if (cp->cpu_disp->disp_maxrunpri > cpupri)
 2649                                         cpupri = cp->cpu_disp->disp_maxrunpri;
 2650                                 if (cp->cpu_chosen_level > cpupri)
 2651                                         cpupri = cp->cpu_chosen_level;
 2652                                 if (cpupri < bestpri) {
 2653                                         if (CPU_IDLING(cpupri)) {
 2654                                                 ASSERT((cp->cpu_flags &
 2655                                                     CPU_QUIESCED) == 0);
 2656                                                 return (cp);
 2657                                         }
 2658                                         bestcpu = cp;
 2659                                         bestpri = cpupri;
 2660                                 }
 2661                         } while ((cp = cp->cpu_next_lpl) != cpstart);
 2662                 }
 2663 
 2664                 if (bestcpu && (tpri > bestpri)) {
 2665                         ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
 2666                         return (bestcpu);
 2667                 }
 2668                 if (besthomecpu == NULL)
 2669                         besthomecpu = bestcpu;
 2670                 /*
 2671                  * Add the lgrps we just considered to the "done" set
 2672                  */
 2673                 klgrpset_or(done, cur_set);
 2674 
 2675         } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
 2676 
 2677         /*
 2678          * The specified priority isn't high enough to run immediately
 2679          * anywhere, so just return the best CPU from the home lgroup.
 2680          */
 2681         ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
 2682         return (besthomecpu);
 2683 }
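
/*
 * Illustrative sketch, not part of the dispatcher proper: the per-CPU
 * comparison disp_lowpri_cpu() makes above.  A CPU's "effective" priority is
 * the highest of what it is running, what is queued on it, and what has
 * already been chosen for it; the best candidate is the CPU whose effective
 * priority is lowest.  Types are simplified and hypothetical; the real code
 * additionally returns an idling CPU immediately, compares the winner
 * against the thread's own priority, and widens the search lgroup by lgroup.
 */
#if 0   /* illustrative only; excluded from compilation */
typedef struct skl_cpu {
        struct skl_cpu  *next;          /* circular list within an lgroup */
        int             dispatch_pri;   /* priority of what is running */
        int             maxrunpri;      /* highest queued priority */
        int             chosen_level;   /* priority already promised to it */
} skl_cpu_t;

static int
skl_effective_pri(const skl_cpu_t *cp)
{
        int pri = cp->dispatch_pri;

        if (cp->maxrunpri > pri)
                pri = cp->maxrunpri;
        if (cp->chosen_level > pri)
                pri = cp->chosen_level;
        return (pri);
}

/* Return the CPU on a circular list with the lowest effective priority. */
static skl_cpu_t *
skl_lowpri_cpu(skl_cpu_t *start)
{
        skl_cpu_t *cp = start;
        skl_cpu_t *best = start;
        int bestpri = skl_effective_pri(start);

        while ((cp = cp->next) != start) {
                int pri = skl_effective_pri(cp);

                if (pri < bestpri) {
                        best = cp;
                        bestpri = pri;
                }
        }
        return (best);
}
#endif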
 2684 
 2685 /*
 2686  * This routine provides the generic idle cpu function for all processors.
 2687  * If a processor has some specific code to execute when idle (say, to stop
 2688  * the pipeline and save power) then that routine should be defined in the
 2689  * the pipeline and save power), then that routine should be defined in the
 2690  * processor's specific code (module_xx.c) and the global variable idle_cpu
 2691  */
 2692 static void
 2693 generic_idle_cpu(void)
 2694 {
 2695 }
 2696 
 2697 /*ARGSUSED*/
 2698 static void
 2699 generic_enq_thread(cpu_t *cpu, int bound)
 2700 {
 2701 }
