FreeBSD/Linux Kernel Cross Reference
sys/common/disp/thread.c


    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or http://www.opensolaris.org/os/licensing.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 
   22 /*
   23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
   24  */
   25 
   26 #include <sys/types.h>
   27 #include <sys/param.h>
   28 #include <sys/sysmacros.h>
   29 #include <sys/signal.h>
   30 #include <sys/stack.h>
   31 #include <sys/pcb.h>
   32 #include <sys/user.h>
   33 #include <sys/systm.h>
   34 #include <sys/sysinfo.h>
   35 #include <sys/errno.h>
   36 #include <sys/cmn_err.h>
   37 #include <sys/cred.h>
   38 #include <sys/resource.h>
   39 #include <sys/task.h>
   40 #include <sys/project.h>
   41 #include <sys/proc.h>
   42 #include <sys/debug.h>
   43 #include <sys/disp.h>
   44 #include <sys/class.h>
   45 #include <vm/seg_kmem.h>
   46 #include <vm/seg_kp.h>
   47 #include <sys/machlock.h>
   48 #include <sys/kmem.h>
   49 #include <sys/varargs.h>
   50 #include <sys/turnstile.h>
   51 #include <sys/poll.h>
   52 #include <sys/vtrace.h>
   53 #include <sys/callb.h>
   54 #include <c2/audit.h>
   55 #include <sys/tnf.h>
   56 #include <sys/sobject.h>
   57 #include <sys/cpupart.h>
   58 #include <sys/pset.h>
   59 #include <sys/door.h>
   60 #include <sys/spl.h>
   61 #include <sys/copyops.h>
   62 #include <sys/rctl.h>
   63 #include <sys/brand.h>
   64 #include <sys/pool.h>
   65 #include <sys/zone.h>
   66 #include <sys/tsol/label.h>
   67 #include <sys/tsol/tndb.h>
   68 #include <sys/cpc_impl.h>
   69 #include <sys/sdt.h>
   70 #include <sys/reboot.h>
   71 #include <sys/kdi.h>
   72 #include <sys/schedctl.h>
   73 #include <sys/waitq.h>
   74 #include <sys/cpucaps.h>
   75 #include <sys/kiconv.h>
   76 
   77 struct kmem_cache *thread_cache;        /* cache of free threads */
   78 struct kmem_cache *lwp_cache;           /* cache of free lwps */
   79 struct kmem_cache *turnstile_cache;     /* cache of free turnstiles */
   80 
   81 /*
   82  * allthreads is only for use by kmem_readers.  All kernel loops can use
   83  * the current thread as a start/end point.
   84  */
   85 static kthread_t *allthreads = &t0;     /* circular list of all threads */
   86 
   87 static kcondvar_t reaper_cv;            /* synchronization var */
   88 kthread_t       *thread_deathrow;       /* circular list of reapable threads */
   89 kthread_t       *lwp_deathrow;          /* circular list of reapable threads */
   90 kmutex_t        reaplock;               /* protects lwp and thread deathrows */
   91 int     thread_reapcnt = 0;             /* number of threads on deathrow */
   92 int     lwp_reapcnt = 0;                /* number of lwps on deathrow */
   93 int     reaplimit = 16;                 /* delay reaping until reaplimit */
   94 
   95 thread_free_lock_t      *thread_free_lock;
   96                                         /* protects tick thread from reaper */
   97 
   98 extern int nthread;
   99 
  100 /* System Scheduling classes. */
  101 id_t    syscid;                         /* system scheduling class ID */
  102 id_t    sysdccid = CLASS_UNUSED;        /* reset when SDC loads */
  103 
  104 void    *segkp_thread;                  /* cookie for segkp pool */
  105 
  106 int lwp_cache_sz = 32;
  107 int t_cache_sz = 8;
  108 static kt_did_t next_t_id = 1;
  109 
  110 /* Default mode for thread binding to CPUs and processor sets */
  111 int default_binding_mode = TB_ALLHARD;
  112 
  113 /*
  114  * Min/Max stack sizes for stack size parameters
  115  */
  116 #define MAX_STKSIZE     (32 * DEFAULTSTKSZ)
  117 #define MIN_STKSIZE     DEFAULTSTKSZ
  118 
  119 /*
  120  * default_stksize overrides lwp_default_stksize if it is set.
  121  */
  122 int     default_stksize;
  123 int     lwp_default_stksize;
  124 
  125 static zone_key_t zone_thread_key;
  126 
  127 unsigned int kmem_stackinfo;            /* stackinfo feature on-off */
  128 kmem_stkinfo_t *kmem_stkinfo_log;       /* stackinfo circular log */
  129 static kmutex_t kmem_stkinfo_lock;      /* protects kmem_stkinfo_log */
  130 
  131 /*
  132  * forward declarations for internal thread specific data (tsd)
  133  */
  134 static void *tsd_realloc(void *, size_t, size_t);
  135 
  136 void thread_reaper(void);
  137 
  138 /* forward declarations for stackinfo feature */
  139 static void stkinfo_begin(kthread_t *);
  140 static void stkinfo_end(kthread_t *);
  141 static size_t stkinfo_percent(caddr_t, caddr_t, caddr_t);
  142 
  143 /*ARGSUSED*/
  144 static int
  145 turnstile_constructor(void *buf, void *cdrarg, int kmflags)
  146 {
  147         bzero(buf, sizeof (turnstile_t));
  148         return (0);
  149 }
  150 
  151 /*ARGSUSED*/
  152 static void
  153 turnstile_destructor(void *buf, void *cdrarg)
  154 {
  155         turnstile_t *ts = buf;
  156 
  157         ASSERT(ts->ts_free == NULL);
  158         ASSERT(ts->ts_waiters == 0);
  159         ASSERT(ts->ts_inheritor == NULL);
  160         ASSERT(ts->ts_sleepq[0].sq_first == NULL);
  161         ASSERT(ts->ts_sleepq[1].sq_first == NULL);
  162 }
  163 
  164 void
  165 thread_init(void)
  166 {
  167         kthread_t *tp;
  168         extern char sys_name[];
  169         extern void idle();
  170         struct cpu *cpu = CPU;
  171         int i;
  172         kmutex_t *lp;
  173 
  174         mutex_init(&reaplock, NULL, MUTEX_SPIN, (void *)ipltospl(DISP_LEVEL));
  175         thread_free_lock =
  176             kmem_alloc(sizeof (thread_free_lock_t) * THREAD_FREE_NUM, KM_SLEEP);
  177         for (i = 0; i < THREAD_FREE_NUM; i++) {
  178                 lp = &thread_free_lock[i].tf_lock;
  179                 mutex_init(lp, NULL, MUTEX_DEFAULT, NULL);
  180         }
  181 
  182 #if defined(__i386) || defined(__amd64)
  183         thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
  184             PTR24_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
  185 
  186         /*
  187          * "struct _klwp" includes a "struct pcb", which includes a
  188          * "struct fpu", which needs to be 64-byte aligned on amd64
  189          * (and even on i386) for xsave/xrstor.
  190          */
  191         lwp_cache = kmem_cache_create("lwp_cache", sizeof (klwp_t),
  192             64, NULL, NULL, NULL, NULL, NULL, 0);
  193 #else
  194         /*
  195          * Allocate thread structures from static_arena.  This prevents
  196          * issues where a thread tries to relocate its own thread
  197          * structure and touches it after the mapping has been suspended.
  198          */
  199         thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
  200             PTR24_ALIGN, NULL, NULL, NULL, NULL, static_arena, 0);
  201 
  202         lwp_stk_cache_init();
  203 
  204         lwp_cache = kmem_cache_create("lwp_cache", sizeof (klwp_t),
  205             0, NULL, NULL, NULL, NULL, NULL, 0);
  206 #endif
  207 
  208         turnstile_cache = kmem_cache_create("turnstile_cache",
  209             sizeof (turnstile_t), 0,
  210             turnstile_constructor, turnstile_destructor, NULL, NULL, NULL, 0);
  211 
  212         label_init();
  213         cred_init();
  214 
  215         /*
  216          * Initialize various resource management facilities.
  217          */
  218         rctl_init();
  219         cpucaps_init();
  220         /*
   221          * zone_init() should be called before project_init() so that project ID
  222          * for the first project is initialized correctly.
  223          */
  224         zone_init();
  225         project_init();
  226         brand_init();
  227         kiconv_init();
  228         task_init();
  229         tcache_init();
  230         pool_init();
  231 
  232         curthread->t_ts = kmem_cache_alloc(turnstile_cache, KM_SLEEP);
  233 
  234         /*
  235          * Originally, we had two parameters to set default stack
  236          * size: one for lwp's (lwp_default_stksize), and one for
  237          * kernel-only threads (DEFAULTSTKSZ, a.k.a. _defaultstksz).
  238          * Now we have a third parameter that overrides both if it is
  239          * set to a legal stack size, called default_stksize.
  240          */
  241 
  242         if (default_stksize == 0) {
  243                 default_stksize = DEFAULTSTKSZ;
  244         } else if (default_stksize % PAGESIZE != 0 ||
  245             default_stksize > MAX_STKSIZE ||
  246             default_stksize < MIN_STKSIZE) {
  247                 cmn_err(CE_WARN, "Illegal stack size. Using %d",
  248                     (int)DEFAULTSTKSZ);
  249                 default_stksize = DEFAULTSTKSZ;
  250         } else {
  251                 lwp_default_stksize = default_stksize;
  252         }
  253 
  254         if (lwp_default_stksize == 0) {
  255                 lwp_default_stksize = default_stksize;
  256         } else if (lwp_default_stksize % PAGESIZE != 0 ||
  257             lwp_default_stksize > MAX_STKSIZE ||
  258             lwp_default_stksize < MIN_STKSIZE) {
  259                 cmn_err(CE_WARN, "Illegal stack size. Using %d",
  260                     default_stksize);
  261                 lwp_default_stksize = default_stksize;
  262         }
  263 
  264         segkp_lwp = segkp_cache_init(segkp, lwp_cache_sz,
  265             lwp_default_stksize,
  266             (KPD_NOWAIT | KPD_HASREDZONE | KPD_LOCKED));
  267 
  268         segkp_thread = segkp_cache_init(segkp, t_cache_sz,
  269             default_stksize, KPD_HASREDZONE | KPD_LOCKED | KPD_NO_ANON);
  270 
  271         (void) getcid(sys_name, &syscid);
  272         curthread->t_cid = syscid;      /* current thread is t0 */
  273 
  274         /*
  275          * Set up the first CPU's idle thread.
  276          * It runs whenever the CPU has nothing worthwhile to do.
  277          */
  278         tp = thread_create(NULL, 0, idle, NULL, 0, &p0, TS_STOPPED, -1);
  279         cpu->cpu_idle_thread = tp;
  280         tp->t_preempt = 1;
  281         tp->t_disp_queue = cpu->cpu_disp;
  282         ASSERT(tp->t_disp_queue != NULL);
  283         tp->t_bound_cpu = cpu;
  284         tp->t_affinitycnt = 1;
  285 
  286         /*
  287          * Registering a thread in the callback table is usually
  288          * done in the initialization code of the thread. In this
  289          * case, we do it right after thread creation to avoid
   290          * blocking the idle thread while it registers itself. It also
  291          * avoids the possibility of reregistration in case a CPU
  292          * restarts its idle thread.
  293          */
  294         CALLB_CPR_INIT_SAFE(tp, "idle");
  295 
  296         /*
  297          * Create the thread_reaper daemon. From this point on, exited
  298          * threads will get reaped.
  299          */
  300         (void) thread_create(NULL, 0, (void (*)())thread_reaper,
  301             NULL, 0, &p0, TS_RUN, minclsyspri);
  302 
  303         /*
  304          * Finish initializing the kernel memory allocator now that
  305          * thread_create() is available.
  306          */
  307         kmem_thread_init();
  308 
  309         if (boothowto & RB_DEBUG)
  310                 kdi_dvec_thravail();
  311 }
  312 
  313 /*
  314  * Create a thread.
  315  *
  316  * thread_create() blocks for memory if necessary.  It never fails.
  317  *
  318  * If stk is NULL, the thread is created at the base of the stack
  319  * and cannot be swapped.
  320  */
  321 kthread_t *
  322 thread_create(
  323         caddr_t stk,
  324         size_t  stksize,
  325         void    (*proc)(),
  326         void    *arg,
  327         size_t  len,
  328         proc_t   *pp,
  329         int     state,
  330         pri_t   pri)
  331 {
  332         kthread_t *t;
  333         extern struct classfuncs sys_classfuncs;
  334         turnstile_t *ts;
  335 
  336         /*
  337          * Every thread keeps a turnstile around in case it needs to block.
  338          * The only reason the turnstile is not simply part of the thread
  339          * structure is that we may have to break the association whenever
  340          * more than one thread blocks on a given synchronization object.
  341          * From a memory-management standpoint, turnstiles are like the
  342          * "attached mblks" that hang off dblks in the streams allocator.
  343          */
  344         ts = kmem_cache_alloc(turnstile_cache, KM_SLEEP);
  345 
  346         if (stk == NULL) {
  347                 /*
  348                  * alloc both thread and stack in segkp chunk
  349                  */
  350 
  351                 if (stksize < default_stksize)
  352                         stksize = default_stksize;
  353 
  354                 if (stksize == default_stksize) {
  355                         stk = (caddr_t)segkp_cache_get(segkp_thread);
  356                 } else {
  357                         stksize = roundup(stksize, PAGESIZE);
  358                         stk = (caddr_t)segkp_get(segkp, stksize,
  359                             (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED));
  360                 }
  361 
  362                 ASSERT(stk != NULL);
  363 
  364                 /*
  365                  * The machine-dependent mutex code may require that
  366                  * thread pointers (since they may be used for mutex owner
  367                  * fields) have certain alignment requirements.
  368                  * PTR24_ALIGN is the size of the alignment quanta.
  369                  * XXX - assumes stack grows toward low addresses.
  370                  */
  371                 if (stksize <= sizeof (kthread_t) + PTR24_ALIGN)
  372                         cmn_err(CE_PANIC, "thread_create: proposed stack size"
  373                             " too small to hold thread.");
  374 #ifdef STACK_GROWTH_DOWN
  375                 stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1);
  376                 stksize &= -PTR24_ALIGN;        /* make thread aligned */
  377                 t = (kthread_t *)(stk + stksize);
  378                 bzero(t, sizeof (kthread_t));
  379                 if (audit_active)
  380                         audit_thread_create(t);
  381                 t->t_stk = stk + stksize;
  382                 t->t_stkbase = stk;
  383 #else   /* stack grows to larger addresses */
  384                 stksize -= SA(sizeof (kthread_t));
  385                 t = (kthread_t *)(stk);
  386                 bzero(t, sizeof (kthread_t));
  387                 t->t_stk = stk + sizeof (kthread_t);
  388                 t->t_stkbase = stk + stksize + sizeof (kthread_t);
  389 #endif  /* STACK_GROWTH_DOWN */
  390                 t->t_flag |= T_TALLOCSTK;
  391                 t->t_swap = stk;
  392         } else {
  393                 t = kmem_cache_alloc(thread_cache, KM_SLEEP);
  394                 bzero(t, sizeof (kthread_t));
  395                 ASSERT(((uintptr_t)t & (PTR24_ALIGN - 1)) == 0);
  396                 if (audit_active)
  397                         audit_thread_create(t);
  398                 /*
  399                  * Initialize t_stk to the kernel stack pointer to use
  400                  * upon entry to the kernel
  401                  */
  402 #ifdef STACK_GROWTH_DOWN
  403                 t->t_stk = stk + stksize;
  404                 t->t_stkbase = stk;
  405 #else
  406                 t->t_stk = stk;                 /* 3b2-like */
  407                 t->t_stkbase = stk + stksize;
  408 #endif /* STACK_GROWTH_DOWN */
  409         }
  410 
  411         if (kmem_stackinfo != 0) {
  412                 stkinfo_begin(t);
  413         }
  414 
  415         t->t_ts = ts;
  416 
  417         /*
   418          * p_cred could be NULL if thread_create() is called before cred_init()
   419          * is called in main().
  420          */
  421         mutex_enter(&pp->p_crlock);
  422         if (pp->p_cred)
  423                 crhold(t->t_cred = pp->p_cred);
  424         mutex_exit(&pp->p_crlock);
  425         t->t_start = gethrestime_sec();
  426         t->t_startpc = proc;
  427         t->t_procp = pp;
  428         t->t_clfuncs = &sys_classfuncs.thread;
  429         t->t_cid = syscid;
  430         t->t_pri = pri;
  431         t->t_stime = ddi_get_lbolt();
  432         t->t_schedflag = TS_LOAD | TS_DONT_SWAP;
  433         t->t_bind_cpu = PBIND_NONE;
  434         t->t_bindflag = (uchar_t)default_binding_mode;
  435         t->t_bind_pset = PS_NONE;
  436         t->t_plockp = &pp->p_lock;
  437         t->t_copyops = NULL;
  438         t->t_taskq = NULL;
  439         t->t_anttime = 0;
  440         t->t_hatdepth = 0;
  441 
  442         t->t_dtrace_vtime = 1;  /* assure vtimestamp is always non-zero */
  443 
  444         CPU_STATS_ADDQ(CPU, sys, nthreads, 1);
  445 #ifndef NPROBE
  446         /* Kernel probe */
  447         tnf_thread_create(t);
  448 #endif /* NPROBE */
  449         LOCK_INIT_CLEAR(&t->t_lock);
  450 
  451         /*
  452          * Callers who give us a NULL proc must do their own
  453          * stack initialization.  e.g. lwp_create()
  454          */
  455         if (proc != NULL) {
  456                 t->t_stk = thread_stk_init(t->t_stk);
  457                 thread_load(t, proc, arg, len);
  458         }
  459 
  460         /*
  461          * Put a hold on project0. If this thread is actually in a
  462          * different project, then t_proj will be changed later in
  463          * lwp_create().  All kernel-only threads must be in project 0.
  464          */
  465         t->t_proj = project_hold(proj0p);
  466 
  467         lgrp_affinity_init(&t->t_lgrp_affinity);
  468 
  469         mutex_enter(&pidlock);
  470         nthread++;
  471         t->t_did = next_t_id++;
  472         t->t_prev = curthread->t_prev;
  473         t->t_next = curthread;
  474 
  475         /*
  476          * Add the thread to the list of all threads, and initialize
  477          * its t_cpu pointer.  We need to block preemption since
  478          * cpu_offline walks the thread list looking for threads
  479          * with t_cpu pointing to the CPU being offlined.  We want
  480          * to make sure that the list is consistent and that if t_cpu
  481          * is set, the thread is on the list.
  482          */
  483         kpreempt_disable();
  484         curthread->t_prev->t_next = t;
  485         curthread->t_prev = t;
  486 
  487         /*
  488          * Threads should never have a NULL t_cpu pointer so assign it
  489          * here.  If the thread is being created with state TS_RUN a
  490          * better CPU may be chosen when it is placed on the run queue.
  491          *
  492          * We need to keep kernel preemption disabled when setting all
  493          * three fields to keep them in sync.  Also, always create in
  494          * the default partition since that's where kernel threads go
  495          * (if this isn't a kernel thread, t_cpupart will be changed
  496          * in lwp_create before setting the thread runnable).
  497          */
  498         t->t_cpupart = &cp_default;
  499 
  500         /*
  501          * For now, affiliate this thread with the root lgroup.
  502          * Since the kernel does not (presently) allocate its memory
  503          * in a locality aware fashion, the root is an appropriate home.
  504          * If this thread is later associated with an lwp, it will have
   505          * its lgroup re-assigned at that time.
  506          */
  507         lgrp_move_thread(t, &cp_default.cp_lgrploads[LGRP_ROOTID], 1);
  508 
  509         /*
  510          * Inherit the current cpu.  If this cpu isn't part of the chosen
  511          * lgroup, a new cpu will be chosen by cpu_choose when the thread
  512          * is ready to run.
  513          */
  514         if (CPU->cpu_part == &cp_default)
  515                 t->t_cpu = CPU;
  516         else
  517                 t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl,
  518                     t->t_pri, NULL);
  519 
  520         t->t_disp_queue = t->t_cpu->cpu_disp;
  521         kpreempt_enable();
  522 
  523         /*
  524          * Initialize thread state and the dispatcher lock pointer.
  525          * Need to hold onto pidlock to block allthreads walkers until
  526          * the state is set.
  527          */
  528         switch (state) {
  529         case TS_RUN:
  530                 curthread->t_oldspl = splhigh();        /* get dispatcher spl */
  531                 THREAD_SET_STATE(t, TS_STOPPED, &transition_lock);
  532                 CL_SETRUN(t);
  533                 thread_unlock(t);
  534                 break;
  535 
  536         case TS_ONPROC:
  537                 THREAD_ONPROC(t, t->t_cpu);
  538                 break;
  539 
  540         case TS_FREE:
  541                 /*
  542                  * Free state will be used for intr threads.
  543                  * The interrupt routine must set the thread dispatcher
  544                  * lock pointer (t_lockp) if starting on a CPU
  545                  * other than the current one.
  546                  */
  547                 THREAD_FREEINTR(t, CPU);
  548                 break;
  549 
  550         case TS_STOPPED:
  551                 THREAD_SET_STATE(t, TS_STOPPED, &stop_lock);
  552                 break;
  553 
  554         default:                        /* TS_SLEEP, TS_ZOMB or TS_TRANS */
  555                 cmn_err(CE_PANIC, "thread_create: invalid state %d", state);
  556         }
  557         mutex_exit(&pidlock);
  558         return (t);
  559 }
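
/*
 * Usage sketch (hypothetical caller, not taken from this file): most kernel
 * callers pass a NULL stack so that thread_create() carves both the stack
 * and the kthread_t out of one segkp chunk, and create the thread runnable
 * in TS_RUN, as the thread_reaper creation in thread_init() does above.
 * example_worker() and example_start() are assumed names.
 */
static void
example_worker(void *arg)
{
	/* ... perform the daemon's work using arg ... */
	thread_exit();		/* exit explicitly when the work is done */
}

static kt_did_t
example_start(void *arg)
{
	kthread_t *t;

	t = thread_create(NULL, 0, (void (*)())example_worker, arg, 0,
	    &p0, TS_RUN, minclsyspri);
	return (t->t_did);	/* keep the id if thread_join() is needed later */
}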
  560 
  561 /*
  562  * Move thread to project0 and take care of project reference counters.
  563  */
  564 void
  565 thread_rele(kthread_t *t)
  566 {
  567         kproject_t *kpj;
  568 
  569         thread_lock(t);
  570 
  571         ASSERT(t == curthread || t->t_state == TS_FREE || t->t_procp == &p0);
  572         kpj = ttoproj(t);
  573         t->t_proj = proj0p;
  574 
  575         thread_unlock(t);
  576 
  577         if (kpj != proj0p) {
  578                 project_rele(kpj);
  579                 (void) project_hold(proj0p);
  580         }
  581 }
  582 
  583 void
  584 thread_exit(void)
  585 {
  586         kthread_t *t = curthread;
  587 
  588         if ((t->t_proc_flag & TP_ZTHREAD) != 0)
  589                 cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called");
  590 
  591         tsd_exit();             /* Clean up this thread's TSD */
  592 
  593         kcpc_passivate();       /* clean up performance counter state */
  594 
  595         /*
  596          * No kernel thread should have called poll() without arranging
   597          * for pollcleanup() to be called here.
  598          */
  599         ASSERT(t->t_pollstate == NULL);
  600         ASSERT(t->t_schedctl == NULL);
  601         if (t->t_door)
  602                 door_slam();    /* in case thread did an upcall */
  603 
  604 #ifndef NPROBE
  605         /* Kernel probe */
  606         if (t->t_tnf_tpdp)
  607                 tnf_thread_exit();
  608 #endif /* NPROBE */
  609 
  610         thread_rele(t);
  611         t->t_preempt++;
  612 
  613         /*
  614          * remove thread from the all threads list so that
  615          * death-row can use the same pointers.
  616          */
  617         mutex_enter(&pidlock);
  618         t->t_next->t_prev = t->t_prev;
  619         t->t_prev->t_next = t->t_next;
  620         ASSERT(allthreads != t);        /* t0 never exits */
  621         cv_broadcast(&t->t_joincv);     /* wake up anyone in thread_join */
  622         mutex_exit(&pidlock);
  623 
  624         if (t->t_ctx != NULL)
  625                 exitctx(t);
  626         if (t->t_procp->p_pctx != NULL)
  627                 exitpctx(t->t_procp);
  628 
  629         if (kmem_stackinfo != 0) {
  630                 stkinfo_end(t);
  631         }
  632 
  633         t->t_state = TS_ZOMB;   /* set zombie thread */
  634 
  635         swtch_from_zombie();    /* give up the CPU */
  636         /* NOTREACHED */
  637 }
  638 
  639 /*
  640  * Check to see if the specified thread is active (defined as being on
  641  * the thread list).  This is certainly a slow way to do this; if there's
  642  * ever a reason to speed it up, we could maintain a hash table of active
  643  * threads indexed by their t_did.
  644  */
  645 static kthread_t *
  646 did_to_thread(kt_did_t tid)
  647 {
  648         kthread_t *t;
  649 
  650         ASSERT(MUTEX_HELD(&pidlock));
  651         for (t = curthread->t_next; t != curthread; t = t->t_next) {
  652                 if (t->t_did == tid)
  653                         break;
  654         }
  655         if (t->t_did == tid)
  656                 return (t);
  657         else
  658                 return (NULL);
  659 }
  660 
  661 /*
  662  * Wait for specified thread to exit.  Returns immediately if the thread
  663  * could not be found, meaning that it has either already exited or never
  664  * existed.
  665  */
  666 void
  667 thread_join(kt_did_t tid)
  668 {
  669         kthread_t *t;
  670 
  671         ASSERT(tid != curthread->t_did);
  672         ASSERT(tid != t0.t_did);
  673 
  674         mutex_enter(&pidlock);
  675         /*
  676          * Make sure we check that the thread is on the thread list
  677          * before blocking on it; otherwise we could end up blocking on
  678          * a cv that's already been freed.  In other words, don't cache
  679          * the thread pointer across calls to cv_wait.
  680          *
  681          * The choice of loop invariant means that whenever a thread
  682          * is taken off the allthreads list, a cv_broadcast must be
  683          * performed on that thread's t_joincv to wake up any waiters.
  684          * The broadcast doesn't have to happen right away, but it
  685          * shouldn't be postponed indefinitely (e.g., by doing it in
   686          * thread_free(), which may only be executed when the deathrow
   687          * queue is processed).
  688          */
  689         while (t = did_to_thread(tid))
  690                 cv_wait(&t->t_joincv, &pidlock);
  691         mutex_exit(&pidlock);
  692 }
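
/*
 * Usage sketch (assumed names): thread_join() takes a thread id (t_did)
 * rather than a thread pointer because the kthread_t may already have been
 * reaped by the time the waiter runs.  The caller records the id while the
 * thread is known to exist and waits on it later.
 */
static kt_did_t example_tid;	/* recorded when the worker was created */

static void
example_stop(void)
{
	/* ... tell the worker thread to exit (flag, cv_signal, etc.) ... */
	thread_join(example_tid);	/* returns once the worker has exited */
}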
  693 
  694 void
  695 thread_free_prevent(kthread_t *t)
  696 {
  697         kmutex_t *lp;
  698 
  699         lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
  700         mutex_enter(lp);
  701 }
  702 
  703 void
  704 thread_free_allow(kthread_t *t)
  705 {
  706         kmutex_t *lp;
  707 
  708         lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
  709         mutex_exit(lp);
  710 }
  711 
  712 static void
  713 thread_free_barrier(kthread_t *t)
  714 {
  715         kmutex_t *lp;
  716 
  717         lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
  718         mutex_enter(lp);
  719         mutex_exit(lp);
  720 }
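
/*
 * Usage sketch (example_examine() is an assumed name): code such as the
 * tick accounting path that must look at a thread it does not own brackets
 * the access with thread_free_prevent()/thread_free_allow().  Because
 * thread_free() passes through the same per-hash lock in
 * thread_free_barrier(), the thread cannot be freed while it is being
 * examined.
 */
static void
example_examine(kthread_t *t)
{
	thread_free_prevent(t);
	/* ... safely read fields of *t here ... */
	thread_free_allow(t);
}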
  721 
  722 void
  723 thread_free(kthread_t *t)
  724 {
  725         boolean_t allocstk = (t->t_flag & T_TALLOCSTK);
  726         klwp_t *lwp = t->t_lwp;
  727         caddr_t swap = t->t_swap;
  728 
  729         ASSERT(t != &t0 && t->t_state == TS_FREE);
  730         ASSERT(t->t_door == NULL);
  731         ASSERT(t->t_schedctl == NULL);
  732         ASSERT(t->t_pollstate == NULL);
  733 
  734         t->t_pri = 0;
  735         t->t_pc = 0;
  736         t->t_sp = 0;
  737         t->t_wchan0 = NULL;
  738         t->t_wchan = NULL;
  739         if (t->t_cred != NULL) {
  740                 crfree(t->t_cred);
  741                 t->t_cred = 0;
  742         }
  743         if (t->t_pdmsg) {
  744                 kmem_free(t->t_pdmsg, strlen(t->t_pdmsg) + 1);
  745                 t->t_pdmsg = NULL;
  746         }
  747         if (audit_active)
  748                 audit_thread_free(t);
  749 #ifndef NPROBE
  750         if (t->t_tnf_tpdp)
  751                 tnf_thread_free(t);
  752 #endif /* NPROBE */
  753         if (t->t_cldata) {
  754                 CL_EXITCLASS(t->t_cid, (caddr_t *)t->t_cldata);
  755         }
  756         if (t->t_rprof != NULL) {
  757                 kmem_free(t->t_rprof, sizeof (*t->t_rprof));
  758                 t->t_rprof = NULL;
  759         }
  760         t->t_lockp = NULL;      /* nothing should try to lock this thread now */
  761         if (lwp)
  762                 lwp_freeregs(lwp, 0);
  763         if (t->t_ctx)
  764                 freectx(t, 0);
  765         t->t_stk = NULL;
  766         if (lwp)
  767                 lwp_stk_fini(lwp);
  768         lock_clear(&t->t_lock);
  769 
  770         if (t->t_ts->ts_waiters > 0)
  771                 panic("thread_free: turnstile still active");
  772 
  773         kmem_cache_free(turnstile_cache, t->t_ts);
  774 
  775         free_afd(&t->t_activefd);
  776 
  777         /*
  778          * Barrier for the tick accounting code.  The tick accounting code
  779          * holds this lock to keep the thread from going away while it's
  780          * looking at it.
  781          */
  782         thread_free_barrier(t);
  783 
  784         ASSERT(ttoproj(t) == proj0p);
  785         project_rele(ttoproj(t));
  786 
  787         lgrp_affinity_free(&t->t_lgrp_affinity);
  788 
  789         mutex_enter(&pidlock);
  790         nthread--;
  791         mutex_exit(&pidlock);
  792 
  793         /*
  794          * Free thread, lwp and stack.  This needs to be done carefully, since
  795          * if T_TALLOCSTK is set, the thread is part of the stack.
  796          */
  797         t->t_lwp = NULL;
  798         t->t_swap = NULL;
  799 
  800         if (swap) {
  801                 segkp_release(segkp, swap);
  802         }
  803         if (lwp) {
  804                 kmem_cache_free(lwp_cache, lwp);
  805         }
  806         if (!allocstk) {
  807                 kmem_cache_free(thread_cache, t);
  808         }
  809 }
  810 
  811 /*
  812  * Removes threads associated with the given zone from a deathrow queue.
  813  * tp is a pointer to the head of the deathrow queue, and countp is a
  814  * pointer to the current deathrow count.  Returns a linked list of
  815  * threads removed from the list.
  816  */
  817 static kthread_t *
  818 thread_zone_cleanup(kthread_t **tp, int *countp, zoneid_t zoneid)
  819 {
  820         kthread_t *tmp, *list = NULL;
  821         cred_t *cr;
  822 
  823         ASSERT(MUTEX_HELD(&reaplock));
  824         while (*tp != NULL) {
  825                 if ((cr = (*tp)->t_cred) != NULL && crgetzoneid(cr) == zoneid) {
  826                         tmp = *tp;
  827                         *tp = tmp->t_forw;
  828                         tmp->t_forw = list;
  829                         list = tmp;
  830                         (*countp)--;
  831                 } else {
  832                         tp = &(*tp)->t_forw;
  833                 }
  834         }
  835         return (list);
  836 }
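
/*
 * Aside (hypothetical types, same idiom as thread_zone_cleanup() above):
 * walking a singly linked list through a pointer to the link field lets
 * the loop unlink a matching element in place, with no separate
 * "previous" pointer to maintain.
 */
struct example_node {
	struct example_node *n_next;
	int n_key;
};

static struct example_node *
example_remove(struct example_node **headp, int key)
{
	struct example_node *removed = NULL;

	while (*headp != NULL) {
		if ((*headp)->n_key == key) {
			removed = *headp;
			*headp = removed->n_next;	/* unlink in place */
			removed->n_next = NULL;
			break;
		}
		headp = &(*headp)->n_next;
	}
	return (removed);
}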
  837 
  838 static void
  839 thread_reap_list(kthread_t *t)
  840 {
  841         kthread_t *next;
  842 
  843         while (t != NULL) {
  844                 next = t->t_forw;
  845                 thread_free(t);
  846                 t = next;
  847         }
  848 }
  849 
  850 /* ARGSUSED */
  851 static void
  852 thread_zone_destroy(zoneid_t zoneid, void *unused)
  853 {
  854         kthread_t *t, *l;
  855 
  856         mutex_enter(&reaplock);
  857         /*
  858          * Pull threads and lwps associated with zone off deathrow lists.
  859          */
  860         t = thread_zone_cleanup(&thread_deathrow, &thread_reapcnt, zoneid);
  861         l = thread_zone_cleanup(&lwp_deathrow, &lwp_reapcnt, zoneid);
  862         mutex_exit(&reaplock);
  863 
  864         /*
  865          * Guard against race condition in mutex_owner_running:
  866          *      thread=owner(mutex)
  867          *      <interrupt>
  868          *                              thread exits mutex
  869          *                              thread exits
  870          *                              thread reaped
  871          *                              thread struct freed
  872          * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
  873          * A cross call to all cpus will cause the interrupt handler
  874          * to reset the PC if it is in mutex_owner_running, refreshing
  875          * stale thread pointers.
  876          */
  877         mutex_sync();   /* sync with mutex code */
  878 
  879         /*
  880          * Reap threads
  881          */
  882         thread_reap_list(t);
  883 
  884         /*
  885          * Reap lwps
  886          */
  887         thread_reap_list(l);
  888 }
  889 
  890 /*
   891  * Clean up zombie threads that are on deathrow.
  892  */
  893 void
  894 thread_reaper()
  895 {
  896         kthread_t *t, *l;
  897         callb_cpr_t cprinfo;
  898 
  899         /*
  900          * Register callback to clean up threads when zone is destroyed.
  901          */
  902         zone_key_create(&zone_thread_key, NULL, NULL, thread_zone_destroy);
  903 
  904         CALLB_CPR_INIT(&cprinfo, &reaplock, callb_generic_cpr, "t_reaper");
  905         for (;;) {
  906                 mutex_enter(&reaplock);
  907                 while (thread_deathrow == NULL && lwp_deathrow == NULL) {
  908                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
  909                         cv_wait(&reaper_cv, &reaplock);
  910                         CALLB_CPR_SAFE_END(&cprinfo, &reaplock);
  911                 }
  912                 /*
  913                  * mutex_sync() needs to be called when reaping, but
  914                  * not too often.  We limit reaping rate to once
   915                  * per second.  reaplimit is the deathrow backlog at which the
   916                  * reaper is woken; it does not limit thread destruction/creation.
  917                  */
  918                 t = thread_deathrow;
  919                 l = lwp_deathrow;
  920                 thread_deathrow = NULL;
  921                 lwp_deathrow = NULL;
  922                 thread_reapcnt = 0;
  923                 lwp_reapcnt = 0;
  924                 mutex_exit(&reaplock);
  925 
  926                 /*
  927                  * Guard against race condition in mutex_owner_running:
  928                  *      thread=owner(mutex)
  929                  *      <interrupt>
  930                  *                              thread exits mutex
  931                  *                              thread exits
  932                  *                              thread reaped
  933                  *                              thread struct freed
  934                  * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
  935                  * A cross call to all cpus will cause the interrupt handler
  936                  * to reset the PC if it is in mutex_owner_running, refreshing
  937                  * stale thread pointers.
  938                  */
  939                 mutex_sync();   /* sync with mutex code */
  940                 /*
  941                  * Reap threads
  942                  */
  943                 thread_reap_list(t);
  944 
  945                 /*
  946                  * Reap lwps
  947                  */
  948                 thread_reap_list(l);
  949                 delay(hz);
  950         }
  951 }
  952 
  953 /*
   954  * This is called by lwp_create(), etc., to put a thread from lwp_deathrow onto
   955  * thread_deathrow. The thread's state is already TS_FREE, indicating that it is
   956  * reapable. The caller already holds reaplock, and the lwp has already been
   957  * freed.
  958  */
  959 void
  960 reapq_move_lq_to_tq(kthread_t *t)
  961 {
  962         ASSERT(t->t_state == TS_FREE);
  963         ASSERT(MUTEX_HELD(&reaplock));
  964         t->t_forw = thread_deathrow;
  965         thread_deathrow = t;
  966         thread_reapcnt++;
  967         if (lwp_reapcnt + thread_reapcnt > reaplimit)
  968                 cv_signal(&reaper_cv);  /* wake the reaper */
  969 }
  970 
  971 /*
  972  * This is called by resume() to put a zombie thread onto deathrow.
   973  * The thread's state is changed to TS_FREE to indicate that it is reapable.
  974  * This is called from the idle thread so it must not block - just spin.
  975  */
  976 void
  977 reapq_add(kthread_t *t)
  978 {
  979         mutex_enter(&reaplock);
  980 
  981         /*
  982          * lwp_deathrow contains threads with lwp linkage and
  983          * swappable thread stacks which have the default stacksize.
  984          * These threads' lwps and stacks may be reused by lwp_create().
  985          *
  986          * Anything else goes on thread_deathrow(), where it will eventually
  987          * be thread_free()d.
  988          */
  989         if (t->t_flag & T_LWPREUSE) {
  990                 ASSERT(ttolwp(t) != NULL);
  991                 t->t_forw = lwp_deathrow;
  992                 lwp_deathrow = t;
  993                 lwp_reapcnt++;
  994         } else {
  995                 t->t_forw = thread_deathrow;
  996                 thread_deathrow = t;
  997                 thread_reapcnt++;
  998         }
  999         if (lwp_reapcnt + thread_reapcnt > reaplimit)
 1000                 cv_signal(&reaper_cv);  /* wake the reaper */
 1001         t->t_state = TS_FREE;
 1002         lock_clear(&t->t_lock);
 1003 
 1004         /*
 1005          * Before we return, we need to grab and drop the thread lock for
 1006          * the dead thread.  At this point, the current thread is the idle
 1007          * thread, and the dead thread's CPU lock points to the current
 1008          * CPU -- and we must grab and drop the lock to synchronize with
 1009          * a racing thread walking a blocking chain that the zombie thread
 1010          * was recently in.  By this point, that blocking chain is (by
 1011          * definition) stale:  the dead thread is not holding any locks, and
 1012          * is therefore not in any blocking chains -- but if we do not regrab
 1013          * our lock before freeing the dead thread's data structures, the
 1014          * thread walking the (stale) blocking chain will die on memory
 1015          * corruption when it attempts to drop the dead thread's lock.  We
 1016          * only need do this once because there is no way for the dead thread
 1017          * to ever again be on a blocking chain:  once we have grabbed and
 1018          * dropped the thread lock, we are guaranteed that anyone that could
 1019          * have seen this thread in a blocking chain can no longer see it.
 1020          */
 1021         thread_lock(t);
 1022         thread_unlock(t);
 1023 
 1024         mutex_exit(&reaplock);
 1025 }
 1026 
 1027 /*
 1028  * Install thread context ops for the current thread.
 1029  */
 1030 void
 1031 installctx(
 1032         kthread_t *t,
 1033         void    *arg,
 1034         void    (*save)(void *),
 1035         void    (*restore)(void *),
 1036         void    (*fork)(void *, void *),
 1037         void    (*lwp_create)(void *, void *),
 1038         void    (*exit)(void *),
 1039         void    (*free)(void *, int))
 1040 {
 1041         struct ctxop *ctx;
 1042 
 1043         ctx = kmem_alloc(sizeof (struct ctxop), KM_SLEEP);
 1044         ctx->save_op = save;
 1045         ctx->restore_op = restore;
 1046         ctx->fork_op = fork;
 1047         ctx->lwp_create_op = lwp_create;
 1048         ctx->exit_op = exit;
 1049         ctx->free_op = free;
 1050         ctx->arg = arg;
 1051         ctx->next = t->t_ctx;
 1052         t->t_ctx = ctx;
 1053 }
 1054 
 1055 /*
 1056  * Remove the thread context ops from a thread.
 1057  */
 1058 int
 1059 removectx(
 1060         kthread_t *t,
 1061         void    *arg,
 1062         void    (*save)(void *),
 1063         void    (*restore)(void *),
 1064         void    (*fork)(void *, void *),
 1065         void    (*lwp_create)(void *, void *),
 1066         void    (*exit)(void *),
 1067         void    (*free)(void *, int))
 1068 {
 1069         struct ctxop *ctx, *prev_ctx;
 1070 
 1071         /*
 1072          * The incoming kthread_t (which is the thread for which the
 1073          * context ops will be removed) should be one of the following:
 1074          *
 1075          * a) the current thread,
 1076          *
 1077          * b) a thread of a process that's being forked (SIDL),
 1078          *
 1079          * c) a thread that belongs to the same process as the current
 1080          *    thread and for which the current thread is the agent thread,
 1081          *
  1082          * d) a thread that is TS_STOPPED, which (when curthread is not an
  1083          *    agent) indicates a thread being created as part of an lwp
  1084          *    creation.
 1085          */
 1086         ASSERT(t == curthread || ttoproc(t)->p_stat == SIDL ||
 1087             ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
 1088 
 1089         /*
 1090          * Serialize modifications to t->t_ctx to prevent the agent thread
 1091          * and the target thread from racing with each other during lwp exit.
 1092          */
 1093         mutex_enter(&t->t_ctx_lock);
 1094         prev_ctx = NULL;
 1095         for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) {
 1096                 if (ctx->save_op == save && ctx->restore_op == restore &&
 1097                     ctx->fork_op == fork && ctx->lwp_create_op == lwp_create &&
 1098                     ctx->exit_op == exit && ctx->free_op == free &&
 1099                     ctx->arg == arg) {
 1100                         if (prev_ctx)
 1101                                 prev_ctx->next = ctx->next;
 1102                         else
 1103                                 t->t_ctx = ctx->next;
 1104                         mutex_exit(&t->t_ctx_lock);
 1105                         if (ctx->free_op != NULL)
 1106                                 (ctx->free_op)(ctx->arg, 0);
 1107                         kmem_free(ctx, sizeof (struct ctxop));
 1108                         return (1);
 1109                 }
 1110                 prev_ctx = ctx;
 1111         }
 1112         mutex_exit(&t->t_ctx_lock);
 1113 
 1114         return (0);
 1115 }
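
/*
 * Usage sketch (example_save(), example_restore() and the state argument
 * are assumed names): a subsystem that keeps per-thread hardware or
 * software state installs context ops on curthread and later removes them
 * with the same argument/operation tuple.  Operations that are not needed
 * may be passed as NULL.
 */
static void
example_save(void *arg)
{
	/* capture per-thread state into arg before the thread switches out */
}

static void
example_restore(void *arg)
{
	/* reload per-thread state from arg when the thread switches back in */
}

static void
example_attach(void *state)
{
	installctx(curthread, state, example_save, example_restore,
	    NULL, NULL, NULL, NULL);
}

static void
example_detach(void *state)
{
	(void) removectx(curthread, state, example_save, example_restore,
	    NULL, NULL, NULL, NULL);
}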
 1116 
 1117 void
 1118 savectx(kthread_t *t)
 1119 {
 1120         struct ctxop *ctx;
 1121 
 1122         ASSERT(t == curthread);
 1123         for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
 1124                 if (ctx->save_op != NULL)
 1125                         (ctx->save_op)(ctx->arg);
 1126 }
 1127 
 1128 void
 1129 restorectx(kthread_t *t)
 1130 {
 1131         struct ctxop *ctx;
 1132 
 1133         ASSERT(t == curthread);
 1134         for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
 1135                 if (ctx->restore_op != NULL)
 1136                         (ctx->restore_op)(ctx->arg);
 1137 }
 1138 
 1139 void
 1140 forkctx(kthread_t *t, kthread_t *ct)
 1141 {
 1142         struct ctxop *ctx;
 1143 
 1144         for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
 1145                 if (ctx->fork_op != NULL)
 1146                         (ctx->fork_op)(t, ct);
 1147 }
 1148 
 1149 /*
 1150  * Note that this operator is only invoked via the _lwp_create
 1151  * system call.  The system may have other reasons to create lwps
 1152  * e.g. the agent lwp or the doors unreferenced lwp.
 1153  */
 1154 void
 1155 lwp_createctx(kthread_t *t, kthread_t *ct)
 1156 {
 1157         struct ctxop *ctx;
 1158 
 1159         for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
 1160                 if (ctx->lwp_create_op != NULL)
 1161                         (ctx->lwp_create_op)(t, ct);
 1162 }
 1163 
 1164 /*
 1165  * exitctx is called from thread_exit() and lwp_exit() to perform any actions
 1166  * needed when the thread/LWP leaves the processor for the last time. This
 1167  * routine is not intended to deal with freeing memory; freectx() is used for
 1168  * that purpose during thread_free(). This routine is provided to allow for
 1169  * clean-up that can't wait until thread_free().
 1170  */
 1171 void
 1172 exitctx(kthread_t *t)
 1173 {
 1174         struct ctxop *ctx;
 1175 
 1176         for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
 1177                 if (ctx->exit_op != NULL)
 1178                         (ctx->exit_op)(t);
 1179 }
 1180 
 1181 /*
 1182  * freectx is called from thread_free() and exec() to get
 1183  * rid of old thread context ops.
 1184  */
 1185 void
 1186 freectx(kthread_t *t, int isexec)
 1187 {
 1188         struct ctxop *ctx;
 1189 
 1190         while ((ctx = t->t_ctx) != NULL) {
 1191                 t->t_ctx = ctx->next;
 1192                 if (ctx->free_op != NULL)
 1193                         (ctx->free_op)(ctx->arg, isexec);
 1194                 kmem_free(ctx, sizeof (struct ctxop));
 1195         }
 1196 }
 1197 
 1198 /*
 1199  * freectx_ctx is called from lwp_create() when lwp is reused from
 1200  * lwp_deathrow and its thread structure is added to thread_deathrow.
  1201  * The thread structure to which this ctx was attached may already have been
  1202  * freed by the thread reaper, so free_op implementations shouldn't rely on
  1203  * that thread structure still being around.
 1204  */
 1205 void
 1206 freectx_ctx(struct ctxop *ctx)
 1207 {
 1208         struct ctxop *nctx;
 1209 
 1210         ASSERT(ctx != NULL);
 1211 
 1212         do {
 1213                 nctx = ctx->next;
 1214                 if (ctx->free_op != NULL)
 1215                         (ctx->free_op)(ctx->arg, 0);
 1216                 kmem_free(ctx, sizeof (struct ctxop));
 1217         } while ((ctx = nctx) != NULL);
 1218 }
 1219 
 1220 /*
 1221  * Set the thread running; arrange for it to be swapped in if necessary.
 1222  */
 1223 void
 1224 setrun_locked(kthread_t *t)
 1225 {
 1226         ASSERT(THREAD_LOCK_HELD(t));
 1227         if (t->t_state == TS_SLEEP) {
 1228                 /*
 1229                  * Take off sleep queue.
 1230                  */
 1231                 SOBJ_UNSLEEP(t->t_sobj_ops, t);
 1232         } else if (t->t_state & (TS_RUN | TS_ONPROC)) {
 1233                 /*
 1234                  * Already on dispatcher queue.
 1235                  */
 1236                 return;
 1237         } else if (t->t_state == TS_WAIT) {
 1238                 waitq_setrun(t);
 1239         } else if (t->t_state == TS_STOPPED) {
 1240                 /*
  1241                  * All of the senders of SIGCONT (TC_XSTART), /proc
  1242                  * (TC_PSTART), and lwp_continue() (TC_CSTART) must have
  1243                  * requested that the thread be run.
 1244                  * Just calling setrun() is not sufficient to set a stopped
 1245                  * thread running.  TP_TXSTART is always set if the thread
 1246                  * is not stopped by a jobcontrol stop signal.
 1247                  * TP_TPSTART is always set if /proc is not controlling it.
 1248                  * TP_TCSTART is always set if lwp_suspend() didn't stop it.
 1249                  * The thread won't be stopped unless one of these
 1250                  * three mechanisms did it.
 1251                  *
 1252                  * These flags must be set before calling setrun_locked(t).
 1253                  * They can't be passed as arguments because the streams
 1254                  * code calls setrun() indirectly and the mechanism for
 1255                  * doing so admits only one argument.  Note that the
 1256                  * thread must be locked in order to change t_schedflags.
 1257                  */
 1258                 if ((t->t_schedflag & TS_ALLSTART) != TS_ALLSTART)
 1259                         return;
 1260                 /*
 1261                  * Process is no longer stopped (a thread is running).
 1262                  */
 1263                 t->t_whystop = 0;
 1264                 t->t_whatstop = 0;
 1265                 /*
 1266                  * Strictly speaking, we do not have to clear these
 1267                  * flags here; they are cleared on entry to stop().
 1268                  * However, they are confusing when doing kernel
 1269                  * debugging or when they are revealed by ps(1).
 1270                  */
 1271                 t->t_schedflag &= ~TS_ALLSTART;
 1272                 THREAD_TRANSITION(t);   /* drop stopped-thread lock */
 1273                 ASSERT(t->t_lockp == &transition_lock);
 1274                 ASSERT(t->t_wchan0 == NULL && t->t_wchan == NULL);
 1275                 /*
 1276                  * Let the class put the process on the dispatcher queue.
 1277                  */
 1278                 CL_SETRUN(t);
 1279         }
 1280 }
 1281 
 1282 void
 1283 setrun(kthread_t *t)
 1284 {
 1285         thread_lock(t);
 1286         setrun_locked(t);
 1287         thread_unlock(t);
 1288 }
 1289 
 1290 /*
 1291  * Unpin an interrupted thread.
 1292  *      When an interrupt occurs, the interrupt is handled on the stack
 1293  *      of an interrupt thread, taken from a pool linked to the CPU structure.
 1294  *
 1295  *      When swtch() is switching away from an interrupt thread because it
 1296  *      blocked or was preempted, this routine is called to complete the
 1297  *      saving of the interrupted thread state, and returns the interrupted
 1298  *      thread pointer so it may be resumed.
 1299  *
 1300  *      Called by swtch() only at high spl.
 1301  */
 1302 kthread_t *
 1303 thread_unpin()
 1304 {
 1305         kthread_t       *t = curthread; /* current thread */
 1306         kthread_t       *itp;           /* interrupted thread */
 1307         int             i;              /* interrupt level */
 1308         extern int      intr_passivate();
 1309 
 1310         ASSERT(t->t_intr != NULL);
 1311 
 1312         itp = t->t_intr;                /* interrupted thread */
 1313         t->t_intr = NULL;               /* clear interrupt ptr */
 1314 
 1315         /*
 1316          * Get state from interrupt thread for the one
 1317          * it interrupted.
 1318          */
 1319 
 1320         i = intr_passivate(t, itp);
 1321 
 1322         TRACE_5(TR_FAC_INTR, TR_INTR_PASSIVATE,
 1323             "intr_passivate:level %d curthread %p (%T) ithread %p (%T)",
 1324             i, t, t, itp, itp);
 1325 
 1326         /*
 1327          * Dissociate the current thread from the interrupted thread's LWP.
 1328          */
 1329         t->t_lwp = NULL;
 1330 
 1331         /*
 1332          * Interrupt handlers above the level that spinlocks block must
 1333          * not block.
 1334          */
 1335 #if DEBUG
 1336         if (i < 0 || i > LOCK_LEVEL)
 1337                 cmn_err(CE_PANIC, "thread_unpin: ipl out of range %x", i);
 1338 #endif
 1339 
 1340         /*
 1341          * Compute the CPU's base interrupt level based on the active
 1342          * interrupts.
 1343          */
 1344         ASSERT(CPU->cpu_intr_actv & (1 << i));
 1345         set_base_spl();
 1346 
 1347         return (itp);
 1348 }
 1349 
 1350 /*
 1351  * Create and initialize an interrupt thread.
 1353  *      Called at spl7() or better.
 1354  */
 1355 void
 1356 thread_create_intr(struct cpu *cp)
 1357 {
 1358         kthread_t *tp;
 1359 
 1360         tp = thread_create(NULL, 0,
 1361             (void (*)())thread_create_intr, NULL, 0, &p0, TS_ONPROC, 0);
 1362 
 1363         /*
 1364          * Set the thread in the TS_FREE state.  The state will change
 1365          * to TS_ONPROC only while the interrupt is active.  Think of these
 1366          * as being on a private free list for the CPU.  Being TS_FREE keeps
 1367          * inactive interrupt threads out of debugger thread lists.
 1368          *
 1369          * We cannot call thread_create with TS_FREE because of the current
 1370          * checks there for ONPROC.  Fix this when thread_create takes flags.
 1371          */
 1372         THREAD_FREEINTR(tp, cp);
 1373 
 1374         /*
 1375          * Nobody should ever reference the credentials of an interrupt
 1376          * thread so make it NULL to catch any such references.
 1377          */
 1378         tp->t_cred = NULL;
 1379         tp->t_flag |= T_INTR_THREAD;
 1380         tp->t_cpu = cp;
 1381         tp->t_bound_cpu = cp;
 1382         tp->t_disp_queue = cp->cpu_disp;
 1383         tp->t_affinitycnt = 1;
 1384         tp->t_preempt = 1;
 1385 
 1386         /*
 1387          * Don't make a user-requested binding on this thread so that
 1388          * the processor can be offlined.
 1389          */
 1390         tp->t_bind_cpu = PBIND_NONE;    /* no USER-requested binding */
 1391         tp->t_bind_pset = PS_NONE;
 1392 
 1393 #if defined(__i386) || defined(__amd64)
 1394         tp->t_stk -= STACK_ALIGN;
 1395         *(tp->t_stk) = 0;               /* terminate intr thread stack */
 1396 #endif
 1397 
 1398         /*
 1399          * Link onto CPU's interrupt pool.
 1400          */
 1401         tp->t_link = cp->cpu_intr_thread;
 1402         cp->cpu_intr_thread = tp;
 1403 }
 1404 
 1405 /*
 1406  * TSD -- THREAD SPECIFIC DATA
 1407  */
 1408 static kmutex_t         tsd_mutex;       /* linked list spin lock */
 1409 static uint_t           tsd_nkeys;       /* size of destructor array */
 1410 /* per-key destructor funcs */
 1411 static void             (**tsd_destructor)(void *);
 1412 /* list of tsd_thread's */
 1413 static struct tsd_thread        *tsd_list;
 1414 
 1415 /*
 1416  * Default destructor
 1417  *      Needed because NULL destructor means that the key is unused
 1418  */
 1419 /* ARGSUSED */
 1420 void
 1421 tsd_defaultdestructor(void *value)
 1422 {}
 1423 
 1424 /*
 1425  * Create a key (index into per thread array)
 1426  *      Locks out tsd_create, tsd_destroy, and tsd_exit
 1427  *      May allocate memory with lock held
 1428  */
 1429 void
 1430 tsd_create(uint_t *keyp, void (*destructor)(void *))
 1431 {
 1432         int     i;
 1433         uint_t  nkeys;
 1434 
 1435         /*
 1436          * if key is allocated, do nothing
 1437          */
 1438         mutex_enter(&tsd_mutex);
 1439         if (*keyp) {
 1440                 mutex_exit(&tsd_mutex);
 1441                 return;
 1442         }
 1443         /*
 1444          * find an unused key
 1445          */
 1446         if (destructor == NULL)
 1447                 destructor = tsd_defaultdestructor;
 1448 
 1449         for (i = 0; i < tsd_nkeys; ++i)
 1450                 if (tsd_destructor[i] == NULL)
 1451                         break;
 1452 
 1453         /*
 1454          * if no unused keys, increase the size of the destructor array
 1455          */
 1456         if (i == tsd_nkeys) {
 1457                 if ((nkeys = (tsd_nkeys << 1)) == 0)
 1458                         nkeys = 1;
 1459                 tsd_destructor =
 1460                     (void (**)(void *))tsd_realloc((void *)tsd_destructor,
 1461                     (size_t)(tsd_nkeys * sizeof (void (*)(void *))),
 1462                     (size_t)(nkeys * sizeof (void (*)(void *))));
 1463                 tsd_nkeys = nkeys;
 1464         }
 1465 
 1466         /*
 1467          * allocate the next available unused key
 1468          */
 1469         tsd_destructor[i] = destructor;
 1470         *keyp = i + 1;
 1471         mutex_exit(&tsd_mutex);
 1472 }
 1473 
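/*
 * A short usage note: keys are 1-based indexes into the per-thread value
 * array, so a zero key always means "not yet created", and tsd_create() is
 * a no-op once the key exists.  With the doubling above, tsd_nkeys grows
 * 0 -> 1 -> 2 -> 4 -> 8 ... as keys are created.  For example (hypothetical
 * caller):
 *
 *      static uint_t my_key;           zero: key not created yet
 *      tsd_create(&my_key, NULL);      first call: my_key becomes non-zero
 *      tsd_create(&my_key, NULL);      key already set: returns immediately
 */
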
 1474 /*
 1475  * Destroy a key -- this is for unloadable modules
 1476  *
 1477  * Assumes that the caller is preventing tsd_set and tsd_get
 1478  * Locks out tsd_create, tsd_destroy, and tsd_exit
 1479  * May free memory with lock held
 1480  */
 1481 void
 1482 tsd_destroy(uint_t *keyp)
 1483 {
 1484         uint_t key;
 1485         struct tsd_thread *tsd;
 1486 
 1487         /*
 1488          * protect the key namespace and our destructor lists
 1489          */
 1490         mutex_enter(&tsd_mutex);
 1491         key = *keyp;
 1492         *keyp = 0;
 1493 
 1494         ASSERT(key <= tsd_nkeys);
 1495 
 1496         /*
 1497          * if the key is valid
 1498          */
 1499         if (key != 0) {
 1500                 uint_t k = key - 1;
 1501                 /*
 1502                  * for every thread with TSD, call key's destructor
 1503                  */
 1504                 for (tsd = tsd_list; tsd; tsd = tsd->ts_next) {
 1505                         /*
 1506                          * no TSD for key in this thread
 1507                          */
 1508                         if (key > tsd->ts_nkeys)
 1509                                 continue;
 1510                         /*
 1511                          * call destructor for key
 1512                          */
 1513                         if (tsd->ts_value[k] && tsd_destructor[k])
 1514                                 (*tsd_destructor[k])(tsd->ts_value[k]);
 1515                         /*
 1516                          * reset value for key
 1517                          */
 1518                         tsd->ts_value[k] = NULL;
 1519                 }
 1520                 /*
 1521                  * actually free the key (NULL destructor == unused)
 1522                  */
 1523                 tsd_destructor[k] = NULL;
 1524         }
 1525 
 1526         mutex_exit(&tsd_mutex);
 1527 }
 1528 
 1529 /*
 1530  * Quickly return the per thread value that was stored with the specified key
 1531  * Assumes the caller is protecting key from tsd_create and tsd_destroy
 1532  */
 1533 void *
 1534 tsd_get(uint_t key)
 1535 {
 1536         return (tsd_agent_get(curthread, key));
 1537 }
 1538 
 1539 /*
 1540  * Set a per thread value indexed with the specified key
 1541  */
 1542 int
 1543 tsd_set(uint_t key, void *value)
 1544 {
 1545         return (tsd_agent_set(curthread, key, value));
 1546 }
 1547 
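/*
 * A minimal usage sketch of the interfaces above, assuming a hypothetical
 * module-local key, destructor, and helper; the pattern is: create the key
 * once, tsd_set()/tsd_get() per thread, and tsd_destroy() at module unload
 * so the key can be reused.
 */
static uint_t example_tsd_key;                  /* zero until tsd_create() */

static void
example_tsd_free(void *value)
{
        /* run by tsd_exit() at thread exit, or by tsd_destroy() */
        kmem_free(value, sizeof (uint_t));
}

static uint_t
example_tsd_bump(void)
{
        uint_t *counter;

        tsd_create(&example_tsd_key, example_tsd_free);

        if ((counter = tsd_get(example_tsd_key)) == NULL) {
                counter = kmem_zalloc(sizeof (uint_t), KM_SLEEP);
                (void) tsd_set(example_tsd_key, counter);
        }
        return (++*counter);
}
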
 1548 /*
 1549  * Like tsd_get(), except that the agent lwp can get the tsd of
 1550  * another thread in the same process (the agent thread only runs when the
 1551  * process is completely stopped by /proc), or when syslwp is creating a new lwp.
 1552  */
 1553 void *
 1554 tsd_agent_get(kthread_t *t, uint_t key)
 1555 {
 1556         struct tsd_thread *tsd = t->t_tsd;
 1557 
 1558         ASSERT(t == curthread ||
 1559             ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
 1560 
 1561         if (key && tsd != NULL && key <= tsd->ts_nkeys)
 1562                 return (tsd->ts_value[key - 1]);
 1563         return (NULL);
 1564 }
 1565 
 1566 /*
 1567  * Like tsd_set(), except that the agent lwp can set the tsd of
 1568  * another thread in the same process, or syslwp can set the tsd
 1569  * of a thread it's in the middle of creating.
 1570  *
 1571  * Assumes the caller is protecting key from tsd_create and tsd_destroy
 1572  * May lock out tsd_destroy (and tsd_create), may allocate memory with
 1573  * lock held
 1574  */
 1575 int
 1576 tsd_agent_set(kthread_t *t, uint_t key, void *value)
 1577 {
 1578         struct tsd_thread *tsd = t->t_tsd;
 1579 
 1580         ASSERT(t == curthread ||
 1581             ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
 1582 
 1583         if (key == 0)
 1584                 return (EINVAL);
 1585         if (tsd == NULL)
 1586                 tsd = t->t_tsd = kmem_zalloc(sizeof (*tsd), KM_SLEEP);
 1587         if (key <= tsd->ts_nkeys) {
 1588                 tsd->ts_value[key - 1] = value;
 1589                 return (0);
 1590         }
 1591 
 1592         ASSERT(key <= tsd_nkeys);
 1593 
 1594         /*
 1595          * lock out tsd_destroy()
 1596          */
 1597         mutex_enter(&tsd_mutex);
 1598         if (tsd->ts_nkeys == 0) {
 1599                 /*
 1600                  * Link onto list of threads with TSD
 1601                  */
 1602                 if ((tsd->ts_next = tsd_list) != NULL)
 1603                         tsd_list->ts_prev = tsd;
 1604                 tsd_list = tsd;
 1605         }
 1606 
 1607         /*
 1608          * Allocate thread local storage and set the value for key
 1609          */
 1610         tsd->ts_value = tsd_realloc(tsd->ts_value,
 1611             tsd->ts_nkeys * sizeof (void *),
 1612             key * sizeof (void *));
 1613         tsd->ts_nkeys = key;
 1614         tsd->ts_value[key - 1] = value;
 1615         mutex_exit(&tsd_mutex);
 1616 
 1617         return (0);
 1618 }
 1619 
 1620 
 1621 /*
 1622  * Return the per thread value that was stored with the specified key
 1623  *      If necessary, create the key and the value
 1624  *      Assumes the caller is protecting *keyp from tsd_destroy
 1625  */
 1626 void *
 1627 tsd_getcreate(uint_t *keyp, void (*destroy)(void *), void *(*allocate)(void))
 1628 {
 1629         void *value;
 1630         uint_t key = *keyp;
 1631         struct tsd_thread *tsd = curthread->t_tsd;
 1632 
 1633         if (tsd == NULL)
 1634                 tsd = curthread->t_tsd = kmem_zalloc(sizeof (*tsd), KM_SLEEP);
 1635         if (key && key <= tsd->ts_nkeys && (value = tsd->ts_value[key - 1]))
 1636                 return (value);
 1637         if (key == 0)
 1638                 tsd_create(keyp, destroy);
 1639         (void) tsd_set(*keyp, value = (*allocate)());
 1640 
 1641         return (value);
 1642 }
 1643 
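/*
 * A minimal sketch of the same idea using tsd_getcreate(), which folds the
 * create/get/allocate/set sequence into one call; the key, allocator, and
 * destructor below are hypothetical.
 */
static uint_t example_lazy_key;

static void *
example_lazy_alloc(void)
{
        return (kmem_zalloc(64, KM_SLEEP));
}

static void
example_lazy_free(void *value)
{
        kmem_free(value, 64);
}

static char *
example_lazy_buffer(void)
{
        /* returns this thread's 64-byte scratch buffer, allocating on first use */
        return (tsd_getcreate(&example_lazy_key, example_lazy_free,
            example_lazy_alloc));
}
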
 1644 /*
 1645  * Called from thread_exit() to run the destructor function for each tsd
 1646  *      Locks out tsd_create and tsd_destroy
 1647  *      Assumes that the destructor *DOES NOT* use tsd
 1648  */
 1649 void
 1650 tsd_exit(void)
 1651 {
 1652         int i;
 1653         struct tsd_thread *tsd = curthread->t_tsd;
 1654 
 1655         if (tsd == NULL)
 1656                 return;
 1657 
 1658         if (tsd->ts_nkeys == 0) {
 1659                 kmem_free(tsd, sizeof (*tsd));
 1660                 curthread->t_tsd = NULL;
 1661                 return;
 1662         }
 1663 
 1664         /*
 1665          * lock out tsd_create and tsd_destroy, call
 1666          * the destructor, and mark the value as destroyed.
 1667          */
 1668         mutex_enter(&tsd_mutex);
 1669 
 1670         for (i = 0; i < tsd->ts_nkeys; i++) {
 1671                 if (tsd->ts_value[i] && tsd_destructor[i])
 1672                         (*tsd_destructor[i])(tsd->ts_value[i]);
 1673                 tsd->ts_value[i] = NULL;
 1674         }
 1675 
 1676         /*
 1677          * remove from linked list of threads with TSD
 1678          */
 1679         if (tsd->ts_next)
 1680                 tsd->ts_next->ts_prev = tsd->ts_prev;
 1681         if (tsd->ts_prev)
 1682                 tsd->ts_prev->ts_next = tsd->ts_next;
 1683         if (tsd_list == tsd)
 1684                 tsd_list = tsd->ts_next;
 1685 
 1686         mutex_exit(&tsd_mutex);
 1687 
 1688         /*
 1689          * free up the TSD
 1690          */
 1691         kmem_free(tsd->ts_value, tsd->ts_nkeys * sizeof (void *));
 1692         kmem_free(tsd, sizeof (struct tsd_thread));
 1693         curthread->t_tsd = NULL;
 1694 }
 1695 
 1696 /*
 1697  * realloc
 1698  */
 1699 static void *
 1700 tsd_realloc(void *old, size_t osize, size_t nsize)
 1701 {
 1702         void *new;
 1703 
 1704         new = kmem_zalloc(nsize, KM_SLEEP);
 1705         if (old) {
 1706                 bcopy(old, new, osize);
 1707                 kmem_free(old, osize);
 1708         }
 1709         return (new);
 1710 }
 1711 
 1712 /*
 1713  * Return non-zero if an interrupt is being serviced.
 1714  */
 1715 int
 1716 servicing_interrupt()
 1717 {
 1718         int onintr = 0;
 1719 
 1720         /* Are we an interrupt thread? */
 1721         if (curthread->t_flag & T_INTR_THREAD)
 1722                 return (1);
 1723         /* Are we servicing a high level interrupt? */
 1724         if (CPU_ON_INTR(CPU)) {
 1725                 kpreempt_disable();
 1726                 onintr = CPU_ON_INTR(CPU);
 1727                 kpreempt_enable();
 1728         }
 1729         return (onintr);
 1730 }
 1731 
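/*
 * A small sketch of a common use for servicing_interrupt(): choosing a
 * non-sleeping allocation when running in interrupt context.  The helper
 * name is hypothetical.
 */
static void *
example_ctx_alloc(size_t size)
{
        return (kmem_alloc(size,
            servicing_interrupt() ? KM_NOSLEEP : KM_SLEEP));
}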
 1732 
 1733 /*
 1734  * Change the dispatch priority of a thread in the system.
 1735  * Used when raising or lowering a thread's priority.
 1736  * (E.g., priority inheritance)
 1737  *
 1738  * Since threads are queued according to their priority, we
 1739  * must check the thread's state to determine whether it
 1740  * is on a queue somewhere. If it is, we've got to:
 1741  *
 1742  *      o Dequeue the thread.
 1743  *      o Change its effective priority.
 1744  *      o Enqueue the thread.
 1745  *
 1746  * Assumptions: The thread whose priority we wish to change
 1747  * must be locked before we call thread_change_(e)pri().
 1748  * The thread_change(e)pri() function doesn't drop the thread
 1749  * lock--that must be done by its caller.
 1750  */
 1751 void
 1752 thread_change_epri(kthread_t *t, pri_t disp_pri)
 1753 {
 1754         uint_t  state;
 1755 
 1756         ASSERT(THREAD_LOCK_HELD(t));
 1757 
 1758         /*
 1759          * If the inherited priority hasn't actually changed,
 1760          * just return.
 1761          */
 1762         if (t->t_epri == disp_pri)
 1763                 return;
 1764 
 1765         state = t->t_state;
 1766 
 1767         /*
 1768          * If it's not on a queue, change the priority with impunity.
 1769          */
 1770         if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
 1771                 t->t_epri = disp_pri;
 1772                 if (state == TS_ONPROC) {
 1773                         cpu_t *cp = t->t_disp_queue->disp_cpu;
 1774 
 1775                         if (t == cp->cpu_dispthread)
 1776                                 cp->cpu_dispatch_pri = DISP_PRIO(t);
 1777                 }
 1778         } else if (state == TS_SLEEP) {
 1779                 /*
 1780                  * Take the thread out of its sleep queue.
 1781                  * Change the inherited priority.
 1782                  * Re-enqueue the thread.
 1783                  * Each synchronization object exports a function
 1784                  * to do this in an appropriate manner.
 1785                  */
 1786                 SOBJ_CHANGE_EPRI(t->t_sobj_ops, t, disp_pri);
 1787         } else if (state == TS_WAIT) {
 1788                 /*
 1789                  * Re-enqueue a thread on the wait queue if its
 1790                  * effective priority needs to change.
 1791                  */
 1792                 if (disp_pri != t->t_epri)
 1793                         waitq_change_pri(t, disp_pri);
 1794         } else {
 1795                 /*
 1796                  * The thread is on a run queue.
 1797                  * Note: setbackdq() may not put the thread
 1798                  * back on the same run queue where it originally
 1799                  * resided.
 1800                  */
 1801                 (void) dispdeq(t);
 1802                 t->t_epri = disp_pri;
 1803                 setbackdq(t);
 1804         }
 1805         schedctl_set_cidpri(t);
 1806 }
 1807 
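/*
 * A minimal caller sketch for the locking contract described above: the
 * thread must be locked before the call and the caller drops the lock
 * afterwards, since thread_change_epri() never drops it.  The wrapper
 * name is hypothetical.
 */
static void
example_set_inherited_pri(kthread_t *t, pri_t new_pri)
{
        thread_lock(t);
        thread_change_epri(t, new_pri);
        thread_unlock(t);
}
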
 1808 /*
 1809  * Function: Change the t_pri field of a thread.
 1810  * Side Effects: Adjust the thread ordering on a run queue
 1811  *               or sleep queue, if necessary.
 1812  * Returns: 1 if the thread was on a run queue, else 0.
 1813  */
 1814 int
 1815 thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
 1816 {
 1817         uint_t  state;
 1818         int     on_rq = 0;
 1819 
 1820         ASSERT(THREAD_LOCK_HELD(t));
 1821 
 1822         state = t->t_state;
 1823         THREAD_WILLCHANGE_PRI(t, disp_pri);
 1824 
 1825         /*
 1826          * If it's not on a queue, change the priority with impunity.
 1827          */
 1828         if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
 1829                 t->t_pri = disp_pri;
 1830 
 1831                 if (state == TS_ONPROC) {
 1832                         cpu_t *cp = t->t_disp_queue->disp_cpu;
 1833 
 1834                         if (t == cp->cpu_dispthread)
 1835                                 cp->cpu_dispatch_pri = DISP_PRIO(t);
 1836                 }
 1837         } else if (state == TS_SLEEP) {
 1838                 /*
 1839                  * If the priority has changed, take the thread out of
 1840                  * its sleep queue and change the priority.
 1841                  * Re-enqueue the thread.
 1842                  * Each synchronization object exports a function
 1843                  * to do this in an appropriate manner.
 1844                  */
 1845                 if (disp_pri != t->t_pri)
 1846                         SOBJ_CHANGE_PRI(t->t_sobj_ops, t, disp_pri);
 1847         } else if (state == TS_WAIT) {
 1848                 /*
 1849                  * Re-enqueue a thread on the wait queue if its
 1850                  * priority needs to change.
 1851                  */
 1852                 if (disp_pri != t->t_pri)
 1853                         waitq_change_pri(t, disp_pri);
 1854         } else {
 1855                 /*
 1856                  * The thread is on a run queue.
 1857                  * Note: setbackdq() may not put the thread
 1858                  * back on the same run queue where it originally
 1859                  * resided.
 1860                  *
 1861                  * We still requeue the thread even if the priority
 1862                  * is unchanged to preserve round-robin (and other)
 1863                  * effects between threads of the same priority.
 1864                  */
 1865                 on_rq = dispdeq(t);
 1866                 ASSERT(on_rq);
 1867                 t->t_pri = disp_pri;
 1868                 if (front) {
 1869                         setfrontdq(t);
 1870                 } else {
 1871                         setbackdq(t);
 1872                 }
 1873         }
 1874         schedctl_set_cidpri(t);
 1875         return (on_rq);
 1876 }
 1877 
 1878 /*
 1879  * Called when the tunable kmem_stackinfo is set: fill the kernel thread
 1880  * stack with a specific pattern.
 1881  */
 1882 static void
 1883 stkinfo_begin(kthread_t *t)
 1884 {
 1885         caddr_t start;  /* stack start */
 1886         caddr_t end;    /* stack end  */
 1887         uint64_t *ptr;  /* pattern pointer */
 1888 
 1889         /*
 1890          * The stack grows up or down (see thread_create());
 1891          * compute the stack memory area start and end (start < end).
 1892          */
 1893         if (t->t_stk > t->t_stkbase) {
 1894                 /* stack grows down */
 1895                 start = t->t_stkbase;
 1896                 end = t->t_stk;
 1897         } else {
 1898                 /* stack grows up */
 1899                 start = t->t_stk;
 1900                 end = t->t_stkbase;
 1901         }
 1902 
 1903         /*
 1904          * The stackinfo pattern size is 8 bytes.  Ensure proper 8-byte
 1905          * alignment for start and end within the stack area boundaries
 1906          * (protection against corrupt t_stkbase/t_stk data).
 1907          */
 1908         if ((((uintptr_t)start) & 0x7) != 0) {
 1909                 start = (caddr_t)((((uintptr_t)start) & (~0x7)) + 8);
 1910         }
 1911         end = (caddr_t)(((uintptr_t)end) & (~0x7));
 1912 
 1913         if ((end <= start) || (end - start) > (1024 * 1024)) {
 1914                 /* negative or stack size > 1 meg, assume bogus */
 1915                 return;
 1916         }
 1917 
 1918         /* fill stack area with a pattern (instead of zeros) */
 1919         ptr = (uint64_t *)((void *)start);
 1920         while (ptr < (uint64_t *)((void *)end)) {
 1921                 *ptr++ = KMEM_STKINFO_PATTERN;
 1922         }
 1923 }
 1924 
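/*
 * A worked example of the rounding above: the range is shrunk inward to
 * 8-byte boundaries, so a start of 0x...1003 becomes 0x...1008 and an end
 * of 0x...1ffe becomes 0x...1ff8; every 64-bit pattern store then lands
 * aligned and strictly inside the original [start, end) range.
 */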
 1925 
 1926 /*
 1927  * Called when the tunable kmem_stackinfo is set: create the stackinfo log if
 1928  * it does not already exist, compute the percentage of kernel stack actually
 1929  * used, and record it in the log if it ranks among the highest seen so far.
 1930  */
 1931 static void
 1932 stkinfo_end(kthread_t *t)
 1933 {
 1934         caddr_t start;  /* stack start */
 1935         caddr_t end;    /* stack end  */
 1936         uint64_t *ptr;  /* pattern pointer */
 1937         size_t stksz;   /* stack size */
 1938         size_t smallest = 0;
 1939         size_t percent = 0;
 1940         uint_t index = 0;
 1941         uint_t i;
 1942         static size_t smallest_percent = (size_t)-1;
 1943         static uint_t full = 0;
 1944 
 1945         /* create the stackinfo log, if it doesn't already exist */
 1946         mutex_enter(&kmem_stkinfo_lock);
 1947         if (kmem_stkinfo_log == NULL) {
 1948                 kmem_stkinfo_log = (kmem_stkinfo_t *)
 1949                     kmem_zalloc(KMEM_STKINFO_LOG_SIZE *
 1950                     (sizeof (kmem_stkinfo_t)), KM_NOSLEEP);
 1951                 if (kmem_stkinfo_log == NULL) {
 1952                         mutex_exit(&kmem_stkinfo_lock);
 1953                         return;
 1954                 }
 1955         }
 1956         mutex_exit(&kmem_stkinfo_lock);
 1957 
 1958         /*
 1959          * The stack grows up or down (see thread_create());
 1960          * compute the stack memory area start and end (start < end).
 1961          */
 1962         if (t->t_stk > t->t_stkbase) {
 1963                 /* stack grows down */
 1964                 start = t->t_stkbase;
 1965                 end = t->t_stk;
 1966         } else {
 1967                 /* stack grows up */
 1968                 start = t->t_stk;
 1969                 end = t->t_stkbase;
 1970         }
 1971 
 1972         /* stack size as found in kthread_t */
 1973         stksz = end - start;
 1974 
 1975         /*
 1976          * The stackinfo pattern size is 8 bytes.  Ensure proper 8-byte
 1977          * alignment for start and end within the stack area boundaries
 1978          * (protection against corrupt t_stkbase/t_stk data).
 1979          */
 1980         if ((((uintptr_t)start) & 0x7) != 0) {
 1981                 start = (caddr_t)((((uintptr_t)start) & (~0x7)) + 8);
 1982         }
 1983         end = (caddr_t)(((uintptr_t)end) & (~0x7));
 1984 
 1985         if ((end <= start) || (end - start) > (1024 * 1024)) {
 1986                 /* negative or stack size > 1 meg, assume bogus */
 1987                 return;
 1988         }
 1989 
 1990         /* search until no pattern in the stack */
 1991         if (t->t_stk > t->t_stkbase) {
 1992                 /* stack grows down */
 1993 #if defined(__i386) || defined(__amd64)
 1994                 /*
 1995                  * 6 longs are pushed on the stack, see thread_load(). Skip
 1996                  * them, so if the kthread has never run, percent is zero.
 1997                  * 8-byte alignment is preserved for a 32-bit kernel,
 1998                  * 6 x 4 = 24, 24 is a multiple of 8.
 1999                  *
 2000                  */
 2001                 end -= (6 * sizeof (long));
 2002 #endif
 2003                 ptr = (uint64_t *)((void *)start);
 2004                 while (ptr < (uint64_t *)((void *)end)) {
 2005                         if (*ptr != KMEM_STKINFO_PATTERN) {
 2006                                 percent = stkinfo_percent(end,
 2007                                     start, (caddr_t)ptr);
 2008                                 break;
 2009                         }
 2010                         ptr++;
 2011                 }
 2012         } else {
 2013                 /* stack grows up */
 2014                 ptr = (uint64_t *)((void *)end);
 2015                 ptr--;
 2016                 while (ptr >= (uint64_t *)((void *)start)) {
 2017                         if (*ptr != KMEM_STKINFO_PATTERN) {
 2018                                 percent = stkinfo_percent(start,
 2019                                     end, (caddr_t)ptr);
 2020                                 break;
 2021                         }
 2022                         ptr--;
 2023                 }
 2024         }
 2025 
 2026         DTRACE_PROBE3(stack__usage, kthread_t *, t,
 2027             size_t, stksz, size_t, percent);
 2028 
 2029         if (percent == 0) {
 2030                 return;
 2031         }
 2032 
 2033         mutex_enter(&kmem_stkinfo_lock);
 2034         if (full == KMEM_STKINFO_LOG_SIZE && percent < smallest_percent) {
 2035                 /*
 2036                  * The log is full and already contains the highest values
 2037                  */
 2038                 mutex_exit(&kmem_stkinfo_lock);
 2039                 return;
 2040         }
 2041 
 2042         /* keep a log of the highest used stack */
 2043         for (i = 0; i < KMEM_STKINFO_LOG_SIZE; i++) {
 2044                 if (kmem_stkinfo_log[i].percent == 0) {
 2045                         index = i;
 2046                         full++;
 2047                         break;
 2048                 }
 2049                 if (smallest == 0) {
 2050                         smallest = kmem_stkinfo_log[i].percent;
 2051                         index = i;
 2052                         continue;
 2053                 }
 2054                 if (kmem_stkinfo_log[i].percent < smallest) {
 2055                         smallest = kmem_stkinfo_log[i].percent;
 2056                         index = i;
 2057                 }
 2058         }
 2059 
 2060         if (percent >= kmem_stkinfo_log[index].percent) {
 2061                 kmem_stkinfo_log[index].kthread = (caddr_t)t;
 2062                 kmem_stkinfo_log[index].t_startpc = (caddr_t)t->t_startpc;
 2063                 kmem_stkinfo_log[index].start = start;
 2064                 kmem_stkinfo_log[index].stksz = stksz;
 2065                 kmem_stkinfo_log[index].percent = percent;
 2066                 kmem_stkinfo_log[index].t_tid = t->t_tid;
 2067                 kmem_stkinfo_log[index].cmd[0] = '\0';
 2068                 if (t->t_tid != 0) {
 2069                         stksz = strlen((t->t_procp)->p_user.u_comm);
 2070                         if (stksz >= KMEM_STKINFO_STR_SIZE) {
 2071                                 stksz = KMEM_STKINFO_STR_SIZE - 1;
 2072                                 kmem_stkinfo_log[index].cmd[stksz] = '\0';
 2073                         } else {
 2074                                 stksz += 1;
 2075                         }
 2076                         (void) memcpy(kmem_stkinfo_log[index].cmd,
 2077                             (t->t_procp)->p_user.u_comm, stksz);
 2078                 }
 2079                 if (percent < smallest_percent) {
 2080                         smallest_percent = percent;
 2081                 }
 2082         }
 2083         mutex_exit(&kmem_stkinfo_lock);
 2084 }
 2085 
 2086 /*
 2087  * Called when the tunable kmem_stackinfo is set: compute stack utilization percentage.
 2088  */
 2089 static size_t
 2090 stkinfo_percent(caddr_t t_stk, caddr_t t_stkbase, caddr_t sp)
 2091 {
 2092         size_t percent;
 2093         size_t s;
 2094 
 2095         if (t_stk > t_stkbase) {
 2096                 /* stack grows down */
 2097                 if (sp > t_stk) {
 2098                         return (0);
 2099                 }
 2100                 if (sp < t_stkbase) {
 2101                         return (100);
 2102                 }
 2103                 percent = t_stk - sp + 1;
 2104                 s = t_stk - t_stkbase + 1;
 2105         } else {
 2106                 /* stack grows up */
 2107                 if (sp < t_stk) {
 2108                         return (0);
 2109                 }
 2110                 if (sp > t_stkbase) {
 2111                         return (100);
 2112                 }
 2113                 percent = sp - t_stk + 1;
 2114                 s = t_stkbase - t_stk + 1;
 2115         }
 2116         percent = ((100 * percent) / s) + 1;
 2117         if (percent > 100) {
 2118                 percent = 100;
 2119         }
 2120         return (percent);
 2121 }
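
/*
 * A worked example for the arithmetic above, for a downward-growing stack
 * with t_stkbase == 0x1000 and t_stk == 0x5000 (16K) whose deepest
 * non-pattern word sits at sp == 0x4000 (about 4K used):
 *
 *      percent = 0x5000 - 0x4000 + 1 = 4097
 *      s       = 0x5000 - 0x1000 + 1 = 16385
 *      ((100 * 4097) / 16385) + 1 = 25 + 1 = 26
 *
 * so the log records 26%.  The trailing "+ 1" means any detected usage
 * reports at least 1%, and the result is clamped to 100%.
 */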
