FreeBSD/Linux Kernel Cross Reference
sys/kern/sched_ule.c
1 /*-
2 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice unmodified, this list of conditions, and the following
10 * disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.121.2.14 2005/02/04 20:13:21 jeff Exp $");
29
30 #include <opt_sched.h>
31
32 #define kse td_sched
33
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kdb.h>
37 #include <sys/kernel.h>
38 #include <sys/ktr.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/proc.h>
42 #include <sys/resource.h>
43 #include <sys/resourcevar.h>
44 #include <sys/sched.h>
45 #include <sys/smp.h>
46 #include <sys/sx.h>
47 #include <sys/sysctl.h>
48 #include <sys/sysproto.h>
49 #include <sys/vmmeter.h>
50 #ifdef KTRACE
51 #include <sys/uio.h>
52 #include <sys/ktrace.h>
53 #endif
54
55 #include <machine/cpu.h>
56 #include <machine/smp.h>
57
58 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
59 /* XXX This is bogus compatibility crap for ps */
60 static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
61 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
62
63 static void sched_setup(void *dummy);
64 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
65
66 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
67
68 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
69 "Scheduler name");
70
71 static int slice_min = 1;
72 SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
73
74 static int slice_max = 10;
75 SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
76
77 int realstathz;
78 int tickincr = 1;
79
80 /*
81 * The schedulable entity that can be given a context to run.
82 * A process may have several of these. Probably one per processor
83 * but possibly a few more. In this universe they are grouped
84 * with a KSEG that contains the priority and niceness
85 * for the group.
86 */
87 struct kse {
88 TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */
89 int ke_flags; /* (j) KEF_* flags. */
90 struct thread *ke_thread; /* (*) Active associated thread. */
91 fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */
92 char ke_rqindex; /* (j) Run queue index. */
93 enum {
94 KES_THREAD = 0x0, /* slaved to thread state */
95 KES_ONRUNQ
96 } ke_state; /* (j) thread sched specific status. */
97 int ke_slptime;
98 int ke_slice;
99 struct runq *ke_runq;
100 u_char ke_cpu; /* CPU that we have affinity for. */
101 /* The following variables are only used for pctcpu calculation */
102 int ke_ltick; /* Last tick that we were running on */
103 int ke_ftick; /* First tick that we were running on */
104 int ke_ticks; /* Tick count */
105
106 };
107
108
109 #define td_kse td_sched
110 #define td_slptime td_kse->ke_slptime
111 #define ke_proc ke_thread->td_proc
112 #define ke_ksegrp ke_thread->td_ksegrp
113
114 /* flags kept in ke_flags */
115 #define KEF_SCHED0 0x00001 /* For scheduler-specific use. */
116 #define KEF_SCHED1 0x00002 /* For scheduler-specific use. */
117 #define KEF_SCHED2 0x00004 /* For scheduler-specific use. */
118 #define KEF_SCHED3 0x00008 /* For scheduler-specific use. */
119 #define KEF_SCHED4 0x00010
120 #define KEF_SCHED5 0x00020
121 #define KEF_DIDRUN 0x02000 /* Thread actually ran. */
122 #define KEF_EXIT 0x04000 /* Thread is being killed. */
123
124 /*
125 * These data structures are allocated within their parent data structure but
126 * are scheduler specific.
127 */
128
129 #define ke_assign ke_procq.tqe_next
130
131 #define KEF_ASSIGNED KEF_SCHED0 /* Thread is being migrated. */
132 #define KEF_BOUND KEF_SCHED1 /* Thread can not migrate. */
133 #define KEF_XFERABLE KEF_SCHED2 /* Thread was added as transferable. */
134 #define KEF_HOLD KEF_SCHED3 /* Thread is temporarily bound. */
135 #define KEF_REMOVED KEF_SCHED4 /* Thread was removed while ASSIGNED */
136 #define KEF_PRIOELEV KEF_SCHED5 /* Thread has had its prio elevated. */
137
138 struct kg_sched {
139 struct thread *skg_last_assigned; /* (j) Last thread assigned to */
140 /* the system scheduler */
141 int skg_slptime; /* Number of ticks we vol. slept */
142 int skg_runtime; /* Number of ticks we were running */
143 int skg_avail_opennings; /* (j) Num unfilled slots in group.*/
144 int skg_concurrency; /* (j) Num threads requested in group.*/
145 };
146 #define kg_last_assigned kg_sched->skg_last_assigned
147 #define kg_avail_opennings kg_sched->skg_avail_opennings
148 #define kg_concurrency kg_sched->skg_concurrency
149 #define kg_runtime kg_sched->skg_runtime
150 #define kg_slptime kg_sched->skg_slptime
151
152 #define SLOT_RELEASE(kg) \
153 do { \
154 kg->kg_avail_opennings++; \
155 CTR3(KTR_RUNQ, "kg %p(%d) Slot released (->%d)", \
156 kg, \
157 kg->kg_concurrency, \
158 kg->kg_avail_opennings); \
159 /*KASSERT((kg->kg_avail_opennings <= kg->kg_concurrency), \
160 ("slots out of whack")); */ \
161 } while (0)
162
163 #define SLOT_USE(kg) \
164 do { \
165 kg->kg_avail_opennings--; \
166 CTR3(KTR_RUNQ, "kg %p(%d) Slot used (->%d)", \
167 kg, \
168 kg->kg_concurrency, \
169 kg->kg_avail_opennings); \
170 /*KASSERT((kg->kg_avail_opennings >= 0), \
171 ("slots out of whack"));*/ \
172 } while (0)
173
174 static struct kse kse0;
175 static struct kg_sched kg_sched0;
176
177 /*
178 * The priority is primarily determined by the interactivity score. Thus, we
179 * give lower(better) priorities to kse groups that use less CPU. The nice
180 * value is then directly added to this to allow nice to have some effect
181 * on latency.
182 *
183 * PRI_RANGE: Total priority range for timeshare threads.
184 * PRI_NRESV: Number of nice values.
185 * PRI_BASE: The start of the dynamic range.
186 */
187 #define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
188 #define SCHED_PRI_NRESV ((PRIO_MAX - PRIO_MIN) + 1)
189 #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2)
190 #define SCHED_PRI_BASE (PRI_MIN_TIMESHARE)
191 #define SCHED_PRI_INTERACT(score) \
192 ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
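/*
 * Worked example (added for illustration, not part of the original
 * source): with the usual priority layout of PRI_MIN_TIMESHARE = 160
 * and PRI_MAX_TIMESHARE = 223, SCHED_PRI_RANGE is 64.  An
 * interactivity score of 50 then yields SCHED_PRI_INTERACT(50) =
 * 50 * 64 / 100 = 32, so sched_priority() computes 160 + 32 + p_nice
 * before clamping to the timeshare range.
 */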
193
194 /*
195 * These determine the interactivity of a process.
196 *
197 * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate
198 * before throttling back.
199 * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time.
200 * INTERACT_MAX: Maximum interactivity value. Smaller is better.
201 * INTERACT_THRESH: Threshold for placement on the current runq.
202 */
203 #define SCHED_SLP_RUN_MAX ((hz * 5) << 10)
204 #define SCHED_SLP_RUN_FORK ((hz / 2) << 10)
205 #define SCHED_INTERACT_MAX (100)
206 #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2)
207 #define SCHED_INTERACT_THRESH (30)
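/*
 * Illustrative arithmetic (assuming hz = 1000): SCHED_SLP_RUN_MAX is
 * (1000 * 5) << 10, i.e. five seconds worth of ticks carrying ten bits
 * of fixed-point fraction.  This matches the "<< 10" scaling applied
 * when kg_runtime and kg_slptime are charged in sched_clock() and
 * sched_wakeup(), so the comparisons in sched_interact_update() stay
 * in one consistent unit.
 */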
208
209 /*
210 * These parameters and macros determine the size of the time slice that is
211 * granted to each thread.
212 *
213 * SLICE_MIN: Minimum time slice granted, in units of ticks.
214 * SLICE_MAX: Maximum time slice granted.
215 * SLICE_RANGE: Range of available time slices scaled by hz.
216 * SLICE_SCALE: The number of slices granted per val in the range of [0, max].
217 * SLICE_NICE: Determine the amount of slice granted to a scaled nice.
218 * SLICE_NTHRESH: The nice cutoff point for slice assignment.
219 */
220 #define SCHED_SLICE_MIN (slice_min)
221 #define SCHED_SLICE_MAX (slice_max)
222 #define SCHED_SLICE_INTERACTIVE (slice_max)
223 #define SCHED_SLICE_NTHRESH (SCHED_PRI_NHALF - 1)
224 #define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
225 #define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max))
226 #define SCHED_SLICE_NICE(nice) \
227 (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
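/*
 * Worked example (illustrative, assuming hz = 1000): sched_setup()
 * sets slice_min = hz / 100 = 10 ticks and slice_max = hz / 7 = 142
 * ticks, so SCHED_SLICE_RANGE is 133.  With SCHED_SLICE_NTHRESH = 19,
 * SCHED_SLICE_SCALE(n, 19) is n * 133 / 19 = 7 * n, and a kse at nice
 * distance 5 from ksq_nicemin receives SCHED_SLICE_NICE(5) =
 * 142 - 35 = 107 ticks.
 */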
228
229 /*
230 * This macro determines whether or not the thread belongs on the current or
231 * next run queue.
232 */
233 #define SCHED_INTERACTIVE(kg) \
234 (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
235 #define SCHED_CURR(kg, ke) \
236 ((ke->ke_flags & KEF_PRIOELEV) || SCHED_INTERACTIVE(kg))
237
238 /*
239 * Cpu percentage computation macros and defines.
240 *
241 * SCHED_CPU_TIME: Number of seconds to average the cpu usage across.
242 * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across.
243 */
244
245 #define SCHED_CPU_TIME 10
246 #define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME)
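/*
 * Illustrative example (assuming hz = realstathz = 1000): the pctcpu
 * window is SCHED_CPU_TICKS = 10000 ticks.  A kse that ran for 5000 of
 * those ticks has rtick = min(5000 / 10, 10000) = 500 ticks per second
 * in sched_pctcpu(), which converts to roughly 50% of FSCALE.
 */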
247
248 /*
249 * kseq - per processor runqs and statistics.
250 */
251 struct kseq {
252 struct runq ksq_idle; /* Queue of IDLE threads. */
253 struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */
254 struct runq *ksq_next; /* Next timeshare queue. */
255 struct runq *ksq_curr; /* Current queue. */
256 int ksq_load_timeshare; /* Load for timeshare. */
257 int ksq_load; /* Aggregate load. */
258 short ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
259 short ksq_nicemin; /* Least nice. */
260 #ifdef SMP
261 int ksq_transferable;
262 LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */
263 struct kseq_group *ksq_group; /* Our processor group. */
264 volatile struct kse *ksq_assigned; /* assigned by another CPU. */
265 #else
266 int ksq_sysload; /* For loadavg, !ITHD load. */
267 #endif
268 };
269
270 #ifdef SMP
271 /*
272 * kseq groups are groups of processors which can cheaply share threads. When
273 * one processor in the group goes idle it will check the runqs of the other
274 * processors in its group prior to halting and waiting for an interrupt.
275 * These groups are suitable for SMT (Simultaneous Multi-Threading) and not NUMA.
276 * In a numa environment we'd want an idle bitmap per group and a two tiered
277 * load balancer.
278 */
279 struct kseq_group {
280 int ksg_cpus; /* Count of CPUs in this kseq group. */
281 cpumask_t ksg_cpumask; /* Mask of cpus in this group. */
282 cpumask_t ksg_idlemask; /* Idle cpus in this group. */
283 cpumask_t ksg_mask; /* Bit mask for first cpu. */
284 int ksg_load; /* Total load of this group. */
285 int ksg_transferable; /* Transferable load of this group. */
286 LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */
287 };
288 #endif
289
290 /*
291 * One kse queue per processor.
292 */
293 #ifdef SMP
294 static cpumask_t kseq_idle;
295 static int ksg_maxid;
296 static struct kseq kseq_cpu[MAXCPU];
297 static struct kseq_group kseq_groups[MAXCPU];
298 static int bal_tick;
299 static int gbal_tick;
300
301 #define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)])
302 #define KSEQ_CPU(x) (&kseq_cpu[(x)])
303 #define KSEQ_ID(x) ((x) - kseq_cpu)
304 #define KSEQ_GROUP(x) (&kseq_groups[(x)])
305 #else /* !SMP */
306 static struct kseq kseq_cpu;
307
308 #define KSEQ_SELF() (&kseq_cpu)
309 #define KSEQ_CPU(x) (&kseq_cpu)
310 #endif
311
312 static void slot_fill(struct ksegrp *kg);
313 static struct kse *sched_choose(void); /* XXX Should be thread * */
314 static void sched_add_internal(struct thread *td, int preemptive);
315 static void sched_slice(struct kse *ke);
316 static void sched_priority(struct ksegrp *kg);
317 static int sched_interact_score(struct ksegrp *kg);
318 static void sched_interact_update(struct ksegrp *kg);
319 static void sched_interact_fork(struct ksegrp *kg);
320 static void sched_pctcpu_update(struct kse *ke);
321
322 /* Operations on per processor queues */
323 static struct kse * kseq_choose(struct kseq *kseq);
324 static void kseq_setup(struct kseq *kseq);
325 static void kseq_load_add(struct kseq *kseq, struct kse *ke);
326 static void kseq_load_rem(struct kseq *kseq, struct kse *ke);
327 static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke);
328 static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke);
329 static void kseq_nice_add(struct kseq *kseq, int nice);
330 static void kseq_nice_rem(struct kseq *kseq, int nice);
331 void kseq_print(int cpu);
332 #ifdef SMP
333 static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class);
334 static struct kse *runq_steal(struct runq *rq);
335 static void sched_balance(void);
336 static void sched_balance_groups(void);
337 static void sched_balance_group(struct kseq_group *ksg);
338 static void sched_balance_pair(struct kseq *high, struct kseq *low);
339 static void kseq_move(struct kseq *from, int cpu);
340 static int kseq_idled(struct kseq *kseq);
341 static void kseq_notify(struct kse *ke, int cpu);
342 static void kseq_assign(struct kseq *);
343 static struct kse *kseq_steal(struct kseq *kseq, int stealidle);
344 /*
345 * On P4 Xeons the round-robin interrupt delivery is broken. As a result of
346 * this, we can't pin interrupts to the cpu that they were delivered to,
347 * otherwise all ithreads only run on CPU 0.
348 */
349 #ifdef __i386__
350 #define KSE_CAN_MIGRATE(ke, class) \
351 ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
352 #else /* !__i386__ */
353 #define KSE_CAN_MIGRATE(ke, class) \
354 ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 && \
355 ((ke)->ke_flags & KEF_BOUND) == 0)
356 #endif /* !__i386__ */
357 #endif
358
359 void
360 kseq_print(int cpu)
361 {
362 struct kseq *kseq;
363 int i;
364
365 kseq = KSEQ_CPU(cpu);
366
367 printf("kseq:\n");
368 printf("\tload: %d\n", kseq->ksq_load);
369 printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
370 #ifdef SMP
371 printf("\tload transferable: %d\n", kseq->ksq_transferable);
372 #endif
373 printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
374 printf("\tnice counts:\n");
375 for (i = 0; i < SCHED_PRI_NRESV; i++)
376 if (kseq->ksq_nice[i])
377 printf("\t\t%d = %d\n",
378 i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
379 }
380
381 static __inline void
382 kseq_runq_add(struct kseq *kseq, struct kse *ke)
383 {
384 #ifdef SMP
385 if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
386 kseq->ksq_transferable++;
387 kseq->ksq_group->ksg_transferable++;
388 ke->ke_flags |= KEF_XFERABLE;
389 }
390 #endif
391 runq_add(ke->ke_runq, ke, 0);
392 }
393
394 static __inline void
395 kseq_runq_rem(struct kseq *kseq, struct kse *ke)
396 {
397 #ifdef SMP
398 if (ke->ke_flags & KEF_XFERABLE) {
399 kseq->ksq_transferable--;
400 kseq->ksq_group->ksg_transferable--;
401 ke->ke_flags &= ~KEF_XFERABLE;
402 }
403 #endif
404 runq_remove(ke->ke_runq, ke);
405 }
406
407 static void
408 kseq_load_add(struct kseq *kseq, struct kse *ke)
409 {
410 int class;
411 mtx_assert(&sched_lock, MA_OWNED);
412 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
413 if (class == PRI_TIMESHARE)
414 kseq->ksq_load_timeshare++;
415 kseq->ksq_load++;
416 CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
417 if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
418 #ifdef SMP
419 kseq->ksq_group->ksg_load++;
420 #else
421 kseq->ksq_sysload++;
422 #endif
423 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
424 kseq_nice_add(kseq, ke->ke_proc->p_nice);
425 }
426
427 static void
428 kseq_load_rem(struct kseq *kseq, struct kse *ke)
429 {
430 int class;
431 mtx_assert(&sched_lock, MA_OWNED);
432 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
433 if (class == PRI_TIMESHARE)
434 kseq->ksq_load_timeshare--;
435 if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
436 #ifdef SMP
437 kseq->ksq_group->ksg_load--;
438 #else
439 kseq->ksq_sysload--;
440 #endif
441 kseq->ksq_load--;
442 CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
443 ke->ke_runq = NULL;
444 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
445 kseq_nice_rem(kseq, ke->ke_proc->p_nice);
446 }
447
448 static void
449 kseq_nice_add(struct kseq *kseq, int nice)
450 {
451 mtx_assert(&sched_lock, MA_OWNED);
452 /* Normalize to zero. */
453 kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
454 if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
455 kseq->ksq_nicemin = nice;
456 }
457
458 static void
459 kseq_nice_rem(struct kseq *kseq, int nice)
460 {
461 int n;
462
463 mtx_assert(&sched_lock, MA_OWNED);
464 /* Normalize to zero. */
465 n = nice + SCHED_PRI_NHALF;
466 kseq->ksq_nice[n]--;
467 KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));
468
469 /*
470 * If this wasn't the smallest nice value or there are more in
471 * this bucket we can just return. Otherwise we have to recalculate
472 * the smallest nice.
473 */
474 if (nice != kseq->ksq_nicemin ||
475 kseq->ksq_nice[n] != 0 ||
476 kseq->ksq_load_timeshare == 0)
477 return;
478
479 for (; n < SCHED_PRI_NRESV; n++)
480 if (kseq->ksq_nice[n]) {
481 kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
482 return;
483 }
484 }
485
486 #ifdef SMP
487 /*
488 * sched_balance is a simple CPU load balancing algorithm. It operates by
489 * finding the least loaded and most loaded cpu and equalizing their load
490 * by migrating some processes.
491 *
492 * Dealing only with two CPUs at a time has two advantages. Firstly, most
493 * installations will only have 2 cpus. Secondly, load balancing too much at
494 * once can have an unpleasant effect on the system. The scheduler rarely has
495 * enough information to make perfect decisions. So this algorithm chooses
496 * simplicity and more gradual effects on load in larger systems.
497 *
498 * It could be improved by considering the priorities and slices assigned to
499 * each task prior to balancing them. There are many pathological cases with
500 * any approach and so the semi random algorithm below may work as well as any.
501 *
502 */
503 static void
504 sched_balance(void)
505 {
506 struct kseq_group *high;
507 struct kseq_group *low;
508 struct kseq_group *ksg;
509 int cnt;
510 int i;
511
512 if (smp_started == 0)
513 goto out;
514 low = high = NULL;
515 i = random() % (ksg_maxid + 1);
516 for (cnt = 0; cnt <= ksg_maxid; cnt++) {
517 ksg = KSEQ_GROUP(i);
518 /*
519 * Find the CPU with the highest load that has some
520 * threads to transfer.
521 */
522 if ((high == NULL || ksg->ksg_load > high->ksg_load)
523 && ksg->ksg_transferable)
524 high = ksg;
525 if (low == NULL || ksg->ksg_load < low->ksg_load)
526 low = ksg;
527 if (++i > ksg_maxid)
528 i = 0;
529 }
530 if (low != NULL && high != NULL && high != low)
531 sched_balance_pair(LIST_FIRST(&high->ksg_members),
532 LIST_FIRST(&low->ksg_members));
533 out:
534 bal_tick = ticks + (random() % (hz * 2));
535 }
536
537 static void
538 sched_balance_groups(void)
539 {
540 int i;
541
542 mtx_assert(&sched_lock, MA_OWNED);
543 if (smp_started)
544 for (i = 0; i <= ksg_maxid; i++)
545 sched_balance_group(KSEQ_GROUP(i));
546 gbal_tick = ticks + (random() % (hz * 2));
547 }
548
549 static void
550 sched_balance_group(struct kseq_group *ksg)
551 {
552 struct kseq *kseq;
553 struct kseq *high;
554 struct kseq *low;
555 int load;
556
557 if (ksg->ksg_transferable == 0)
558 return;
559 low = NULL;
560 high = NULL;
561 LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
562 load = kseq->ksq_load;
563 if (high == NULL || load > high->ksq_load)
564 high = kseq;
565 if (low == NULL || load < low->ksq_load)
566 low = kseq;
567 }
568 if (high != NULL && low != NULL && high != low)
569 sched_balance_pair(high, low);
570 }
571
572 static void
573 sched_balance_pair(struct kseq *high, struct kseq *low)
574 {
575 int transferable;
576 int high_load;
577 int low_load;
578 int move;
579 int diff;
580 int i;
581
582 /*
583 * If we're transferring within a group we have to use this specific
584 * kseq's transferable count, otherwise we can steal from other members
585 * of the group.
586 */
587 if (high->ksq_group == low->ksq_group) {
588 transferable = high->ksq_transferable;
589 high_load = high->ksq_load;
590 low_load = low->ksq_load;
591 } else {
592 transferable = high->ksq_group->ksg_transferable;
593 high_load = high->ksq_group->ksg_load;
594 low_load = low->ksq_group->ksg_load;
595 }
596 if (transferable == 0)
597 return;
598 /*
599 * Determine what the imbalance is and then adjust that to how many
600 * kses we actually have to give up (transferable).
601 */
602 diff = high_load - low_load;
603 move = diff / 2;
604 if (diff & 0x1)
605 move++;
606 move = min(move, transferable);
607 for (i = 0; i < move; i++)
608 kseq_move(high, KSEQ_ID(low));
609 return;
610 }
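/*
 * Worked example (illustrative): with high_load = 7 and low_load = 2,
 * diff is 5 and move rounds up to 3, so up to three kses migrate,
 * capped by the transferable count.  Moving ceil(diff / 2) rather than
 * diff keeps the pair from simply swapping their imbalance.
 */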
611
612 static void
613 kseq_move(struct kseq *from, int cpu)
614 {
615 struct kseq *kseq;
616 struct kseq *to;
617 struct kse *ke;
618
619 kseq = from;
620 to = KSEQ_CPU(cpu);
621 ke = kseq_steal(kseq, 1);
622 if (ke == NULL) {
623 struct kseq_group *ksg;
624
625 ksg = kseq->ksq_group;
626 LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
627 if (kseq == from || kseq->ksq_transferable == 0)
628 continue;
629 ke = kseq_steal(kseq, 1);
630 break;
631 }
632 if (ke == NULL)
633 panic("kseq_move: No KSEs available with a "
634 "transferable count of %d\n",
635 ksg->ksg_transferable);
636 }
637 if (kseq == to)
638 return;
639 ke->ke_state = KES_THREAD;
640 kseq_runq_rem(kseq, ke);
641 kseq_load_rem(kseq, ke);
642 kseq_notify(ke, cpu);
643 }
644
645 static int
646 kseq_idled(struct kseq *kseq)
647 {
648 struct kseq_group *ksg;
649 struct kseq *steal;
650 struct kse *ke;
651
652 ksg = kseq->ksq_group;
653 /*
654 * If we're in a cpu group, try and steal kses from another cpu in
655 * the group before idling.
656 */
657 if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
658 LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
659 if (steal == kseq || steal->ksq_transferable == 0)
660 continue;
661 ke = kseq_steal(steal, 0);
662 if (ke == NULL)
663 continue;
664 ke->ke_state = KES_THREAD;
665 kseq_runq_rem(steal, ke);
666 kseq_load_rem(steal, ke);
667 ke->ke_cpu = PCPU_GET(cpuid);
668 sched_add_internal(ke->ke_thread, 0);
669 return (0);
670 }
671 }
672 /*
673 * We only set the idled bit when all of the cpus in the group are
674 * idle. Otherwise we could get into a situation where a KSE bounces
675 * back and forth between two idle cores on separate physical CPUs.
676 */
677 ksg->ksg_idlemask |= PCPU_GET(cpumask);
678 if (ksg->ksg_idlemask != ksg->ksg_cpumask)
679 return (1);
680 atomic_set_int(&kseq_idle, ksg->ksg_mask);
681 return (1);
682 }
683
684 static void
685 kseq_assign(struct kseq *kseq)
686 {
687 struct kse *nke;
688 struct kse *ke;
689
690 do {
691 *(volatile struct kse **)&ke = kseq->ksq_assigned;
692 } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
693 for (; ke != NULL; ke = nke) {
694 nke = ke->ke_assign;
695 ke->ke_flags &= ~KEF_ASSIGNED;
696 SLOT_RELEASE(ke->ke_thread->td_ksegrp);
697 sched_add_internal(ke->ke_thread, 0);
698 }
699 }
700
701 static void
702 kseq_notify(struct kse *ke, int cpu)
703 {
704 struct kseq *kseq;
705 struct thread *td;
706 struct pcpu *pcpu;
707 int prio;
708
709 ke->ke_cpu = cpu;
710 ke->ke_flags |= KEF_ASSIGNED;
711 SLOT_USE(ke->ke_thread->td_ksegrp);
712 prio = ke->ke_thread->td_priority;
713
714 kseq = KSEQ_CPU(cpu);
715
716 /*
717 * Place a KSE on another cpu's queue and force a resched.
718 */
719 do {
720 *(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned;
721 } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
722 /*
723 * Without sched_lock we could lose a race where we set NEEDRESCHED
724 * on a thread that is switched out before the IPI is delivered. This
725 * would lead us to miss the resched. This will be a problem once
726 * sched_lock is pushed down.
727 */
728 pcpu = pcpu_find(cpu);
729 td = pcpu->pc_curthread;
730 if (ke->ke_thread->td_priority < td->td_priority ||
731 td == pcpu->pc_idlethread) {
732 td->td_flags |= TDF_NEEDRESCHED;
733 ipi_selected(1 << cpu, IPI_AST);
734 }
735 }
736
737 static struct kse *
738 runq_steal(struct runq *rq)
739 {
740 struct rqhead *rqh;
741 struct rqbits *rqb;
742 struct kse *ke;
743 int word;
744 int bit;
745
746 mtx_assert(&sched_lock, MA_OWNED);
747 rqb = &rq->rq_status;
748 for (word = 0; word < RQB_LEN; word++) {
749 if (rqb->rqb_bits[word] == 0)
750 continue;
751 for (bit = 0; bit < RQB_BPW; bit++) {
752 if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
753 continue;
754 rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
755 TAILQ_FOREACH(ke, rqh, ke_procq) {
756 if (KSE_CAN_MIGRATE(ke,
757 PRI_BASE(ke->ke_ksegrp->kg_pri_class)))
758 return (ke);
759 }
760 }
761 }
762 return (NULL);
763 }
764
765 static struct kse *
766 kseq_steal(struct kseq *kseq, int stealidle)
767 {
768 struct kse *ke;
769
770 /*
771 * Steal from next first to try to get a non-interactive task that
772 * may not have run for a while.
773 */
774 if ((ke = runq_steal(kseq->ksq_next)) != NULL)
775 return (ke);
776 if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
777 return (ke);
778 if (stealidle)
779 return (runq_steal(&kseq->ksq_idle));
780 return (NULL);
781 }
782
783 int
784 kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
785 {
786 struct kseq_group *ksg;
787 int cpu;
788
789 if (smp_started == 0)
790 return (0);
791 cpu = 0;
792 /*
793 * If our load exceeds a certain threshold we should attempt to
794 * reassign this thread. The first candidate is the cpu that
795 * originally ran the thread. If it is idle, assign it there,
796 * otherwise, pick an idle cpu.
797 *
798 * The threshold at which we start to reassign kses has a large impact
799 * on the overall performance of the system. Tuned too high and
800 * some CPUs may idle. Too low and there will be excess migration
801 * and context switches.
802 */
803 ksg = kseq->ksq_group;
804 if (ksg->ksg_load > ksg->ksg_cpus && kseq_idle) {
805 ksg = KSEQ_CPU(ke->ke_cpu)->ksq_group;
806 if (kseq_idle & ksg->ksg_mask) {
807 cpu = ffs(ksg->ksg_idlemask);
808 if (cpu)
809 goto migrate;
810 }
811 /*
812 * Multiple cpus could find this bit simultaneously
813 * but the race shouldn't be terrible.
814 */
815 cpu = ffs(kseq_idle);
816 if (cpu)
817 goto migrate;
818 }
819 /*
820 * If another cpu in this group has idled, assign a thread over
821 * to them after checking to see if there are idled groups.
822 */
823 ksg = kseq->ksq_group;
824 if (ksg->ksg_idlemask) {
825 cpu = ffs(ksg->ksg_idlemask);
826 if (cpu)
827 goto migrate;
828 }
829 /*
830 * No new CPU was found.
831 */
832 return (0);
833 migrate:
834 /*
835 * Now that we've found an idle CPU, migrate the thread.
836 */
837 cpu--;
838 ke->ke_runq = NULL;
839 kseq_notify(ke, cpu);
840
841 return (1);
842 }
843
844 #endif /* SMP */
845
846 /*
847 * Pick the highest priority task we have and return it.
848 */
849
850 static struct kse *
851 kseq_choose(struct kseq *kseq)
852 {
853 struct runq *swap;
854 struct kse *ke;
855 int nice;
856
857 mtx_assert(&sched_lock, MA_OWNED);
858 swap = NULL;
859
860 for (;;) {
861 ke = runq_choose(kseq->ksq_curr);
862 if (ke == NULL) {
863 /*
864 * We already swapped once and didn't get anywhere.
865 */
866 if (swap)
867 break;
868 swap = kseq->ksq_curr;
869 kseq->ksq_curr = kseq->ksq_next;
870 kseq->ksq_next = swap;
871 continue;
872 }
873 /*
874 * If we encounter a slice of 0, the kse is in a
875 * TIMESHARE kse group and its nice was too far out
876 * of the range that receives slices.
877 */
878 nice = ke->ke_proc->p_nice + (0 - kseq->ksq_nicemin);
879 if (ke->ke_slice == 0 || (nice > SCHED_SLICE_NTHRESH &&
880 ke->ke_proc->p_nice != 0)) {
881 runq_remove(ke->ke_runq, ke);
882 sched_slice(ke);
883 ke->ke_runq = kseq->ksq_next;
884 runq_add(ke->ke_runq, ke, 0);
885 continue;
886 }
887 return (ke);
888 }
889
890 return (runq_choose(&kseq->ksq_idle));
891 }
892
893 static void
894 kseq_setup(struct kseq *kseq)
895 {
896 runq_init(&kseq->ksq_timeshare[0]);
897 runq_init(&kseq->ksq_timeshare[1]);
898 runq_init(&kseq->ksq_idle);
899 kseq->ksq_curr = &kseq->ksq_timeshare[0];
900 kseq->ksq_next = &kseq->ksq_timeshare[1];
901 kseq->ksq_load = 0;
902 kseq->ksq_load_timeshare = 0;
903 }
904
905 static void
906 sched_setup(void *dummy)
907 {
908 #ifdef SMP
909 int balance_groups;
910 int i;
911 #endif
912
913 slice_min = (hz/100); /* 10ms */
914 slice_max = (hz/7); /* ~140ms */
915
916 #ifdef SMP
917 balance_groups = 0;
918 /*
919 * Initialize the kseqs.
920 */
921 for (i = 0; i < MAXCPU; i++) {
922 struct kseq *ksq;
923
924 ksq = &kseq_cpu[i];
925 ksq->ksq_assigned = NULL;
926 kseq_setup(&kseq_cpu[i]);
927 }
928 if (smp_topology == NULL) {
929 struct kseq_group *ksg;
930 struct kseq *ksq;
931
932 for (i = 0; i < MAXCPU; i++) {
933 ksq = &kseq_cpu[i];
934 ksg = &kseq_groups[i];
935 /*
936 * Setup a kseq group with one member.
937 */
938 ksq->ksq_transferable = 0;
939 ksq->ksq_group = ksg;
940 ksg->ksg_cpus = 1;
941 ksg->ksg_idlemask = 0;
942 ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
943 ksg->ksg_load = 0;
944 ksg->ksg_transferable = 0;
945 LIST_INIT(&ksg->ksg_members);
946 LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
947 }
948 } else {
949 struct kseq_group *ksg;
950 struct cpu_group *cg;
951 int j;
952
953 for (i = 0; i < smp_topology->ct_count; i++) {
954 cg = &smp_topology->ct_group[i];
955 ksg = &kseq_groups[i];
956 /*
957 * Initialize the group.
958 */
959 ksg->ksg_idlemask = 0;
960 ksg->ksg_load = 0;
961 ksg->ksg_transferable = 0;
962 ksg->ksg_cpus = cg->cg_count;
963 ksg->ksg_cpumask = cg->cg_mask;
964 LIST_INIT(&ksg->ksg_members);
965 /*
966 * Find all of the group members and add them.
967 */
968 for (j = 0; j < MAXCPU; j++) {
969 if ((cg->cg_mask & (1 << j)) != 0) {
970 if (ksg->ksg_mask == 0)
971 ksg->ksg_mask = 1 << j;
972 kseq_cpu[j].ksq_transferable = 0;
973 kseq_cpu[j].ksq_group = ksg;
974 LIST_INSERT_HEAD(&ksg->ksg_members,
975 &kseq_cpu[j], ksq_siblings);
976 }
977 }
978 if (ksg->ksg_cpus > 1)
979 balance_groups = 1;
980 }
981 ksg_maxid = smp_topology->ct_count - 1;
982 }
983 /*
984 * Stagger the group and global load balancer so they do not
985 * interfere with each other.
986 */
987 bal_tick = ticks + hz;
988 if (balance_groups)
989 gbal_tick = ticks + (hz / 2);
990 #else
991 kseq_setup(KSEQ_SELF());
992 #endif
993 mtx_lock_spin(&sched_lock);
994 kseq_load_add(KSEQ_SELF(), &kse0);
995 mtx_unlock_spin(&sched_lock);
996 }
997
998 /*
999 * Scale the scheduling priority according to the "interactivity" of this
1000 * process.
1001 */
1002 static void
1003 sched_priority(struct ksegrp *kg)
1004 {
1005 int pri;
1006
1007 if (kg->kg_pri_class != PRI_TIMESHARE)
1008 return;
1009
1010 pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
1011 pri += SCHED_PRI_BASE;
1012 pri += kg->kg_proc->p_nice;
1013
1014 if (pri > PRI_MAX_TIMESHARE)
1015 pri = PRI_MAX_TIMESHARE;
1016 else if (pri < PRI_MIN_TIMESHARE)
1017 pri = PRI_MIN_TIMESHARE;
1018
1019 kg->kg_user_pri = pri;
1020
1021 return;
1022 }
1023
1024 /*
1025 * Calculate a time slice based on the properties of the kseg and the runq
1026 * that we're on. This is only for PRI_TIMESHARE ksegrps.
1027 */
1028 static void
1029 sched_slice(struct kse *ke)
1030 {
1031 struct kseq *kseq;
1032 struct ksegrp *kg;
1033
1034 kg = ke->ke_ksegrp;
1035 kseq = KSEQ_CPU(ke->ke_cpu);
1036
1037 if (ke->ke_flags & KEF_PRIOELEV) {
1038 ke->ke_slice = SCHED_SLICE_MIN;
1039 return;
1040 }
1041
1042 /*
1043 * Rationale:
1044 * KSEs in interactive ksegs get a minimal slice so that we
1045 * quickly notice if it abuses its advantage.
1046 *
1047 * KSEs in non-interactive ksegs are assigned a slice that is
1048 * based on the ksegs nice value relative to the least nice kseg
1049 * on the run queue for this cpu.
1050 *
1051 * If the KSE is less nice than all others it gets the maximum
1052 * slice and other KSEs will adjust their slice relative to
1053 * this when they first expire.
1054 *
1055 * There is a 20 point window that starts relative to the least
1056 * nice kse on the run queue. Slice size is determined by
1057 * the kse distance from the least nice ksegrp.
1058 *
1059 * If the kse is outside of the window it will get no slice
1060 * and will be reevaluated each time it is selected on the
1061 * run queue. The exception to this is nice 0 ksegs when
1062 * a nice -20 is running. They are always granted a minimum
1063 * slice.
1064 */
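	/*
	 * Worked example (illustrative, assuming hz = 1000 so slices span
	 * 10..142 ticks): with ksq_nicemin at -5, a nice 0 kseg sits at
	 * distance 5 and gets 142 - 5 * 7 = 107 ticks, while a nice +15
	 * kseg sits at distance 20, beyond SCHED_SLICE_NTHRESH (19), and
	 * gets a slice of 0.  With ksq_nicemin at -20, a nice 0 kseg is
	 * also at distance 20 but is granted the 10 tick minimum as
	 * described above.
	 */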
1065 if (!SCHED_INTERACTIVE(kg)) {
1066 int nice;
1067
1068 nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin);
1069 if (kseq->ksq_load_timeshare == 0 ||
1070 kg->kg_proc->p_nice < kseq->ksq_nicemin)
1071 ke->ke_slice = SCHED_SLICE_MAX;
1072 else if (nice <= SCHED_SLICE_NTHRESH)
1073 ke->ke_slice = SCHED_SLICE_NICE(nice);
1074 else if (kg->kg_proc->p_nice == 0)
1075 ke->ke_slice = SCHED_SLICE_MIN;
1076 else
1077 ke->ke_slice = 0;
1078 } else
1079 ke->ke_slice = SCHED_SLICE_INTERACTIVE;
1080
1081 return;
1082 }
1083
1084 /*
1085 * This routine enforces a maximum limit on the amount of scheduling history
1086 * kept. It is called after either the slptime or runtime is adjusted.
1087 * This routine will not operate correctly when slp or run times have been
1088 * adjusted to more than double their maximum.
1089 */
1090 static void
1091 sched_interact_update(struct ksegrp *kg)
1092 {
1093 int sum;
1094
1095 sum = kg->kg_runtime + kg->kg_slptime;
1096 if (sum < SCHED_SLP_RUN_MAX)
1097 return;
1098 /*
1099 * If we have exceeded by more than 1/5th then the algorithm below
1100 * will not bring us back into range. Dividing by two here forces
1101 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
1102 */
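	/*
	 * E.g. (illustrative, in seconds of scaled ticks with a 5 second
	 * SCHED_SLP_RUN_MAX): a sum of 5.5s is within 6/5 of the limit,
	 * so runtime and slptime are scaled by 4/5 to a 4.4s total; a
	 * sum of 9s exceeds 6/5 of the limit and is halved to 4.5s
	 * instead.
	 */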
1103 if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1104 kg->kg_runtime /= 2;
1105 kg->kg_slptime /= 2;
1106 return;
1107 }
1108 kg->kg_runtime = (kg->kg_runtime / 5) * 4;
1109 kg->kg_slptime = (kg->kg_slptime / 5) * 4;
1110 }
1111
1112 static void
1113 sched_interact_fork(struct ksegrp *kg)
1114 {
1115 int ratio;
1116 int sum;
1117
1118 sum = kg->kg_runtime + kg->kg_slptime;
1119 if (sum > SCHED_SLP_RUN_FORK) {
1120 ratio = sum / SCHED_SLP_RUN_FORK;
1121 kg->kg_runtime /= ratio;
1122 kg->kg_slptime /= ratio;
1123 }
1124 }
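/*
 * Worked example (illustrative): with SCHED_SLP_RUN_FORK equal to half
 * a second of scaled ticks, a parent carrying 3s of runtime and 1s of
 * slptime has sum = 4s, so ratio = 8 and the child inherits 375ms of
 * runtime and 125ms of slptime, preserving the parent's ratio while
 * letting the child establish its own history quickly.
 */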
1125
1126 static int
1127 sched_interact_score(struct ksegrp *kg)
1128 {
1129 int div;
1130
1131 if (kg->kg_runtime > kg->kg_slptime) {
1132 div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
1133 return (SCHED_INTERACT_HALF +
1134 (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
1135 } else if (kg->kg_slptime > kg->kg_runtime) {
1136 div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
1137 return (kg->kg_runtime / div);
1138 }
1139
1140 /*
1141 * This can happen if slptime and runtime are 0.
1142 */
1143 return (0);
1144
1145 }
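/*
 * Worked examples (illustrative, unit-free since only the ratio
 * matters): a cpu hog with kg_runtime = 100 and kg_slptime = 25 gets
 * div = max(1, 100 / 50) = 2 and a score of 50 + (50 - 25 / 2) = 88;
 * a sleeper with the values swapped scores 25 / 2 = 12, which is under
 * SCHED_INTERACT_THRESH (30) and therefore interactive.
 */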
1146
1147 /*
1148 * Very early in the boot some setup of scheduler-specific
1149 * parts of proc0 and of some scheduler resources needs to be done.
1150 * Called from:
1151 * proc0_init()
1152 */
1153 void
1154 schedinit(void)
1155 {
1156 /*
1157 * Set up the scheduler specific parts of proc0.
1158 */
1159 proc0.p_sched = NULL; /* XXX */
1160 ksegrp0.kg_sched = &kg_sched0;
1161 thread0.td_sched = &kse0;
1162 kse0.ke_thread = &thread0;
1163 kse0.ke_state = KES_THREAD;
1164 kg_sched0.skg_concurrency = 1;
1165 kg_sched0.skg_avail_opennings = 0; /* we are already running */
1166 }
1167
1168 /*
1169 * This is only somewhat accurate since, given many processes of the same
1170 * priority, they will switch when their slices run out, which will be
1171 * at most SCHED_SLICE_MAX.
1172 */
1173 int
1174 sched_rr_interval(void)
1175 {
1176 return (SCHED_SLICE_MAX);
1177 }
1178
1179 static void
1180 sched_pctcpu_update(struct kse *ke)
1181 {
1182 /*
1183 * Adjust counters and watermark for pctcpu calc.
1184 */
1185 if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
1186 /*
1187 * Shift the tick count out so that the divide doesn't
1188 * round away our results.
1189 */
1190 ke->ke_ticks <<= 10;
1191 ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
1192 SCHED_CPU_TICKS;
1193 ke->ke_ticks >>= 10;
1194 } else
1195 ke->ke_ticks = 0;
1196 ke->ke_ltick = ticks;
1197 ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
1198 }
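/*
 * Worked example (illustrative, hz = 1000): a kse that accumulated
 * ke_ticks = 3000 over the last 15000 ticks is rescaled to
 * 3000 / 15000 * 10000 = 2000 ticks, i.e. the same 20% rate expressed
 * over exactly one SCHED_CPU_TICKS window; the temporary << 10 shift
 * just keeps the integer division from discarding the fraction.
 */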
1199
1200 void
1201 sched_prio(struct thread *td, u_char prio)
1202 {
1203 struct kse *ke;
1204
1205 CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
1206 td, td->td_proc->p_comm, td->td_priority, prio, curthread,
1207 curthread->td_proc->p_comm);
1208 ke = td->td_kse;
1209 mtx_assert(&sched_lock, MA_OWNED);
1210 if (TD_ON_RUNQ(td)) {
1211 /*
1212 * If the priority has been elevated due to priority
1213 * propagation, we may have to move ourselves to a new
1214 * queue. We still call adjustrunqueue below in case kse
1215 * needs to fix things up.
1216 */
1217 if (prio < td->td_priority && ke->ke_runq != NULL &&
1218 (ke->ke_flags & KEF_ASSIGNED) == 0 &&
1219 ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
1220 runq_remove(ke->ke_runq, ke);
1221 ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
1222 runq_add(ke->ke_runq, ke, 0);
1223 }
1224 if (prio < td->td_priority)
1225 ke->ke_flags |= KEF_PRIOELEV;
1226 /*
1227 * Hold this kse on this cpu so that sched_prio() doesn't
1228 * cause excessive migration. We only want migration to
1229 * happen as the result of a wakeup.
1230 */
1231 ke->ke_flags |= KEF_HOLD;
1232 adjustrunqueue(td, prio);
1233 } else
1234 td->td_priority = prio;
1235 }
1236
1237 void
1238 sched_switch(struct thread *td, struct thread *newtd, int flags)
1239 {
1240 struct kse *ke;
1241
1242 mtx_assert(&sched_lock, MA_OWNED);
1243
1244 ke = td->td_kse;
1245
1246 td->td_lastcpu = td->td_oncpu;
1247 td->td_oncpu = NOCPU;
1248 td->td_flags &= ~TDF_NEEDRESCHED;
1249 td->td_pflags &= ~TDP_OWEPREEMPT;
1250
1251 /*
1252 * If the KSE has been assigned it may be in the process of switching
1253 * to the new cpu. This is the case in sched_bind().
1254 */
1255 if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
1256 if (td == PCPU_GET(idlethread)) {
1257 TD_SET_CAN_RUN(td);
1258 } else {
1259 /* We are ending our run so make our slot available again */
1260 SLOT_RELEASE(td->td_ksegrp);
1261 if (TD_IS_RUNNING(td)) {
1262 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
1263 /*
1264 * Don't allow the thread to migrate
1265 * from a preemption.
1266 */
1267 ke->ke_flags |= KEF_HOLD;
1268 setrunqueue(td, SRQ_OURSELF|SRQ_YIELDING);
1269 } else {
1270 if (ke->ke_runq) {
1271 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
1272 } else if ((td->td_flags & TDF_IDLETD) == 0)
1273 kdb_backtrace();
1274 /*
1275 * We will not be on the run queue.
1276 * So we must be sleeping or similar.
1277 * Don't use the slot if we will need it
1278 * for newtd.
1279 */
1280 if ((td->td_proc->p_flag & P_HADTHREADS) &&
1281 (newtd == NULL ||
1282 newtd->td_ksegrp != td->td_ksegrp))
1283 slot_fill(td->td_ksegrp);
1284 }
1285 }
1286 }
1287 if (newtd != NULL) {
1288 /*
1289 * If we bring in a thread,
1290 * then account for it as if it had been added to the
1291 * run queue and then chosen.
1292 */
1293 newtd->td_kse->ke_flags |= KEF_DIDRUN;
1294 SLOT_USE(newtd->td_ksegrp);
1295 TD_SET_RUNNING(newtd);
1296 kseq_load_add(KSEQ_SELF(), newtd->td_kse);
1297 } else
1298 newtd = choosethread();
1299 if (td != newtd)
1300 cpu_switch(td, newtd);
1301 sched_lock.mtx_lock = (uintptr_t)td;
1302
1303 td->td_oncpu = PCPU_GET(cpuid);
1304 }
1305
1306 void
1307 sched_nice(struct proc *p, int nice)
1308 {
1309 struct ksegrp *kg;
1310 struct kse *ke;
1311 struct thread *td;
1312 struct kseq *kseq;
1313
1314 PROC_LOCK_ASSERT(p, MA_OWNED);
1315 mtx_assert(&sched_lock, MA_OWNED);
1316 /*
1317 * We need to adjust the nice counts for running KSEs.
1318 */
1319 FOREACH_KSEGRP_IN_PROC(p, kg) {
1320 if (kg->kg_pri_class == PRI_TIMESHARE) {
1321 FOREACH_THREAD_IN_GROUP(kg, td) {
1322 ke = td->td_kse;
1323 if (ke->ke_runq == NULL)
1324 continue;
1325 kseq = KSEQ_CPU(ke->ke_cpu);
1326 kseq_nice_rem(kseq, p->p_nice);
1327 kseq_nice_add(kseq, nice);
1328 }
1329 }
1330 }
1331 p->p_nice = nice;
1332 FOREACH_KSEGRP_IN_PROC(p, kg) {
1333 sched_priority(kg);
1334 FOREACH_THREAD_IN_GROUP(kg, td)
1335 td->td_flags |= TDF_NEEDRESCHED;
1336 }
1337 }
1338
1339 void
1340 sched_sleep(struct thread *td)
1341 {
1342 mtx_assert(&sched_lock, MA_OWNED);
1343
1344 td->td_slptime = ticks;
1345 td->td_base_pri = td->td_priority;
1346 }
1347
1348 void
1349 sched_wakeup(struct thread *td)
1350 {
1351 mtx_assert(&sched_lock, MA_OWNED);
1352
1353 /*
1354 * Let the kseg know how long we slept for. This is because process
1355 * interactivity behavior is modeled in the kseg.
1356 */
1357 if (td->td_slptime) {
1358 struct ksegrp *kg;
1359 int hzticks;
1360
1361 kg = td->td_ksegrp;
1362 hzticks = (ticks - td->td_slptime) << 10;
1363 if (hzticks >= SCHED_SLP_RUN_MAX) {
1364 kg->kg_slptime = SCHED_SLP_RUN_MAX;
1365 kg->kg_runtime = 1;
1366 } else {
1367 kg->kg_slptime += hzticks;
1368 sched_interact_update(kg);
1369 }
1370 sched_priority(kg);
1371 sched_slice(td->td_kse);
1372 td->td_slptime = 0;
1373 }
1374 setrunqueue(td, SRQ_BORING);
1375 }
1376
1377 /*
1378 * Penalize the parent for creating a new child and initialize the child's
1379 * priority.
1380 */
1381 void
1382 sched_fork(struct thread *td, struct thread *childtd)
1383 {
1384
1385 mtx_assert(&sched_lock, MA_OWNED);
1386
1387 sched_fork_ksegrp(td, childtd->td_ksegrp);
1388 sched_fork_thread(td, childtd);
1389 }
1390
1391 void
1392 sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
1393 {
1394 struct ksegrp *kg = td->td_ksegrp;
1395 mtx_assert(&sched_lock, MA_OWNED);
1396
1397 child->kg_slptime = kg->kg_slptime;
1398 child->kg_runtime = kg->kg_runtime;
1399 child->kg_user_pri = kg->kg_user_pri;
1400 sched_interact_fork(child);
1401 kg->kg_runtime += tickincr << 10;
1402 sched_interact_update(kg);
1403 }
1404
1405 void
1406 sched_fork_thread(struct thread *td, struct thread *child)
1407 {
1408 struct kse *ke;
1409 struct kse *ke2;
1410
1411 sched_newthread(child);
1412 ke = td->td_kse;
1413 ke2 = child->td_kse;
1414 ke2->ke_slice = 1; /* Attempt to quickly learn interactivity. */
1415 ke2->ke_cpu = ke->ke_cpu;
1416 ke2->ke_runq = NULL;
1417
1418 /* Grab our parents cpu estimation information. */
1419 ke2->ke_ticks = ke->ke_ticks;
1420 ke2->ke_ltick = ke->ke_ltick;
1421 ke2->ke_ftick = ke->ke_ftick;
1422 }
1423
1424 void
1425 sched_class(struct ksegrp *kg, int class)
1426 {
1427 struct kseq *kseq;
1428 struct kse *ke;
1429 struct thread *td;
1430 int nclass;
1431 int oclass;
1432
1433 mtx_assert(&sched_lock, MA_OWNED);
1434 if (kg->kg_pri_class == class)
1435 return;
1436
1437 nclass = PRI_BASE(class);
1438 oclass = PRI_BASE(kg->kg_pri_class);
1439 FOREACH_THREAD_IN_GROUP(kg, td) {
1440 ke = td->td_kse;
1441 if ((ke->ke_state != KES_ONRUNQ &&
1442 ke->ke_state != KES_THREAD) || ke->ke_runq == NULL)
1443 continue;
1444 kseq = KSEQ_CPU(ke->ke_cpu);
1445
1446 #ifdef SMP
1447 /*
1448 * On SMP if we're on the RUNQ we must adjust the transferable
1449 * count because we could be changing to or from an interrupt
1450 * class.
1451 */
1452 if (ke->ke_state == KES_ONRUNQ) {
1453 if (KSE_CAN_MIGRATE(ke, oclass)) {
1454 kseq->ksq_transferable--;
1455 kseq->ksq_group->ksg_transferable--;
1456 }
1457 if (KSE_CAN_MIGRATE(ke, nclass)) {
1458 kseq->ksq_transferable++;
1459 kseq->ksq_group->ksg_transferable++;
1460 }
1461 }
1462 #endif
1463 if (oclass == PRI_TIMESHARE) {
1464 kseq->ksq_load_timeshare--;
1465 kseq_nice_rem(kseq, kg->kg_proc->p_nice);
1466 }
1467 if (nclass == PRI_TIMESHARE) {
1468 kseq->ksq_load_timeshare++;
1469 kseq_nice_add(kseq, kg->kg_proc->p_nice);
1470 }
1471 }
1472
1473 kg->kg_pri_class = class;
1474 }
1475
1476 /*
1477 * Return some of the child's priority and interactivity to the parent.
1478 */
1479 void
1480 sched_exit(struct proc *p, struct thread *childtd)
1481 {
1482 mtx_assert(&sched_lock, MA_OWNED);
1483 sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd);
1484 sched_exit_thread(NULL, childtd);
1485 }
1486
1487 void
1488 sched_exit_ksegrp(struct ksegrp *kg, struct thread *td)
1489 {
1490 /* kg->kg_slptime += td->td_ksegrp->kg_slptime; */
1491 kg->kg_runtime += td->td_ksegrp->kg_runtime;
1492 sched_interact_update(kg);
1493 }
1494
1495 void
1496 sched_exit_thread(struct thread *td, struct thread *childtd)
1497 {
1498 CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
1499 childtd, childtd->td_proc->p_comm, childtd->td_priority);
1500 kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
1501 }
1502
1503 void
1504 sched_clock(struct thread *td)
1505 {
1506 struct kseq *kseq;
1507 struct ksegrp *kg;
1508 struct kse *ke;
1509
1510 mtx_assert(&sched_lock, MA_OWNED);
1511 kseq = KSEQ_SELF();
1512 #ifdef SMP
1513 if (ticks == bal_tick)
1514 sched_balance();
1515 if (ticks == gbal_tick)
1516 sched_balance_groups();
1517 /*
1518 * We could have been assigned a non real-time thread without an
1519 * IPI.
1520 */
1521 if (kseq->ksq_assigned)
1522 kseq_assign(kseq); /* Potentially sets NEEDRESCHED */
1523 #endif
1524 /*
1525 * sched_setup() apparently happens prior to stathz being set. We
1526 * need to resolve the timers earlier in the boot so we can avoid
1527 * calculating this here.
1528 */
1529 if (realstathz == 0) {
1530 realstathz = stathz ? stathz : hz;
1531 tickincr = hz / realstathz;
1532 /*
1533 * XXX This does not work for values of stathz that are much
1534 * larger than hz.
1535 */
1536 if (tickincr == 0)
1537 tickincr = 1;
1538 }
1539
1540 ke = td->td_kse;
1541 kg = ke->ke_ksegrp;
1542
1543 /* Adjust ticks for pctcpu */
1544 ke->ke_ticks++;
1545 ke->ke_ltick = ticks;
1546
1547 /* Go up to one second beyond our max and then trim back down */
1548 if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
1549 sched_pctcpu_update(ke);
1550
1551 if (td->td_flags & TDF_IDLETD)
1552 return;
1553 /*
1554 * We only do slicing code for TIMESHARE ksegrps.
1555 */
1556 if (kg->kg_pri_class != PRI_TIMESHARE)
1557 return;
1558 /*
1559 * We used a tick; charge it to the ksegrp so that we can compute our
1560 * interactivity.
1561 */
1562 kg->kg_runtime += tickincr << 10;
1563 sched_interact_update(kg);
1564
1565 /*
1566 * We used up one time slice.
1567 */
1568 if (--ke->ke_slice > 0)
1569 return;
1570 /*
1571 * We're out of time, recompute priorities and requeue.
1572 */
1573 kseq_load_rem(kseq, ke);
1574 sched_priority(kg);
1575 sched_slice(ke);
1576 if (SCHED_CURR(kg, ke))
1577 ke->ke_runq = kseq->ksq_curr;
1578 else
1579 ke->ke_runq = kseq->ksq_next;
1580 kseq_load_add(kseq, ke);
1581 td->td_flags |= TDF_NEEDRESCHED;
1582 }
1583
1584 int
1585 sched_runnable(void)
1586 {
1587 struct kseq *kseq;
1588 int load;
1589
1590 load = 1;
1591
1592 kseq = KSEQ_SELF();
1593 #ifdef SMP
1594 if (kseq->ksq_assigned) {
1595 mtx_lock_spin(&sched_lock);
1596 kseq_assign(kseq);
1597 mtx_unlock_spin(&sched_lock);
1598 }
1599 #endif
1600 if ((curthread->td_flags & TDF_IDLETD) != 0) {
1601 if (kseq->ksq_load > 0)
1602 goto out;
1603 } else
1604 if (kseq->ksq_load - 1 > 0)
1605 goto out;
1606 load = 0;
1607 out:
1608 return (load);
1609 }
1610
1611 void
1612 sched_userret(struct thread *td)
1613 {
1614 struct ksegrp *kg;
1615 struct kse *ke;
1616
1617 kg = td->td_ksegrp;
1618 ke = td->td_kse;
1619
1620 if (td->td_priority != kg->kg_user_pri ||
1621 ke->ke_flags & KEF_PRIOELEV) {
1622 mtx_lock_spin(&sched_lock);
1623 td->td_priority = kg->kg_user_pri;
1624 if (ke->ke_flags & KEF_PRIOELEV) {
1625 ke->ke_flags &= ~KEF_PRIOELEV;
1626 sched_slice(ke);
1627 if (ke->ke_slice == 0)
1628 mi_switch(SW_INVOL, NULL);
1629 }
1630 mtx_unlock_spin(&sched_lock);
1631 }
1632 }
1633
1634 struct kse *
1635 sched_choose(void)
1636 {
1637 struct kseq *kseq;
1638 struct kse *ke;
1639
1640 mtx_assert(&sched_lock, MA_OWNED);
1641 kseq = KSEQ_SELF();
1642 #ifdef SMP
1643 restart:
1644 if (kseq->ksq_assigned)
1645 kseq_assign(kseq);
1646 #endif
1647 ke = kseq_choose(kseq);
1648 if (ke) {
1649 #ifdef SMP
1650 if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
1651 if (kseq_idled(kseq) == 0)
1652 goto restart;
1653 #endif
1654 kseq_runq_rem(kseq, ke);
1655 ke->ke_state = KES_THREAD;
1656 return (ke);
1657 }
1658 #ifdef SMP
1659 if (kseq_idled(kseq) == 0)
1660 goto restart;
1661 #endif
1662 return (NULL);
1663 }
1664
1665 void
1666 sched_add(struct thread *td, int flags)
1667 {
1668
1669 /* let jeff work out how to map the flags better */
1670 /* I'm open to suggestions */
1671 if (flags & SRQ_YIELDING)
1672 /*
1673 * Preempting during switching can be bad JUJU
1674 * especially for KSE processes
1675 */
1676 sched_add_internal(td, 0);
1677 else
1678 sched_add_internal(td, 1);
1679 }
1680
1681 static void
1682 sched_add_internal(struct thread *td, int preemptive)
1683 {
1684 struct kseq *kseq;
1685 struct ksegrp *kg;
1686 struct kse *ke;
1687 #ifdef SMP
1688 int canmigrate;
1689 #endif
1690 int class;
1691
1692 CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
1693 td, td->td_proc->p_comm, td->td_priority, curthread,
1694 curthread->td_proc->p_comm);
1695 mtx_assert(&sched_lock, MA_OWNED);
1696 ke = td->td_kse;
1697 kg = td->td_ksegrp;
1698 if (ke->ke_flags & KEF_ASSIGNED) {
1699 if (ke->ke_flags & KEF_REMOVED) {
1700 SLOT_USE(ke->ke_ksegrp);
1701 ke->ke_flags &= ~KEF_REMOVED;
1702 }
1703 return;
1704 }
1705 kseq = KSEQ_SELF();
1706 KASSERT(ke->ke_state != KES_ONRUNQ,
1707 ("sched_add: kse %p (%s) already in run queue", ke,
1708 ke->ke_proc->p_comm));
1709 KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1710 ("sched_add: process swapped out"));
1711 KASSERT(ke->ke_runq == NULL,
1712 ("sched_add: KSE %p is still assigned to a run queue", ke));
1713
1714 class = PRI_BASE(kg->kg_pri_class);
1715 switch (class) {
1716 case PRI_ITHD:
1717 case PRI_REALTIME:
1718 ke->ke_runq = kseq->ksq_curr;
1719 ke->ke_slice = SCHED_SLICE_MAX;
1720 ke->ke_cpu = PCPU_GET(cpuid);
1721 break;
1722 case PRI_TIMESHARE:
1723 if (SCHED_CURR(kg, ke))
1724 ke->ke_runq = kseq->ksq_curr;
1725 else
1726 ke->ke_runq = kseq->ksq_next;
1727 break;
1728 case PRI_IDLE:
1729 /*
1730 * This is for priority prop.
1731 */
1732 if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
1733 ke->ke_runq = kseq->ksq_curr;
1734 else
1735 ke->ke_runq = &kseq->ksq_idle;
1736 ke->ke_slice = SCHED_SLICE_MIN;
1737 break;
1738 default:
1739 panic("Unknown pri class.");
1740 break;
1741 }
1742 #ifdef SMP
1743 /*
1744 * Don't migrate running threads here. Force the long term balancer
1745 * to do it.
1746 */
1747 canmigrate = KSE_CAN_MIGRATE(ke, class);
1748 if (ke->ke_flags & KEF_HOLD) {
1749 ke->ke_flags &= ~KEF_HOLD;
1750 canmigrate = 0;
1751 }
1752 /*
1753 * If this thread is pinned or bound, notify the target cpu.
1754 */
1755 if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid) ) {
1756 ke->ke_runq = NULL;
1757 kseq_notify(ke, ke->ke_cpu);
1758 return;
1759 }
1760 /*
1761 * If we had been idle, clear our bit in the group and potentially
1762 * the global bitmap. If not, see if we should transfer this thread.
1763 */
1764 if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
1765 (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
1766 /*
1767 * Check to see if our group is unidling, and if so, remove it
1768 * from the global idle mask.
1769 */
1770 if (kseq->ksq_group->ksg_idlemask ==
1771 kseq->ksq_group->ksg_cpumask)
1772 atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
1773 /*
1774 * Now remove ourselves from the group specific idle mask.
1775 */
1776 kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
1777 } else if (kseq->ksq_load > 1 && canmigrate)
1778 if (kseq_transfer(kseq, ke, class))
1779 return;
1780 ke->ke_cpu = PCPU_GET(cpuid);
1781 #endif
1782 /*
1783 * XXX With preemption this is not necessary.
1784 */
1785 if (td->td_priority < curthread->td_priority &&
1786 ke->ke_runq == kseq->ksq_curr)
1787 curthread->td_flags |= TDF_NEEDRESCHED;
1788 if (preemptive && maybe_preempt(td))
1789 return;
1790 SLOT_USE(td->td_ksegrp);
1791 ke->ke_state = KES_ONRUNQ;
1792
1793 kseq_runq_add(kseq, ke);
1794 kseq_load_add(kseq, ke);
1795 }
1796
1797 void
1798 sched_rem(struct thread *td)
1799 {
1800 struct kseq *kseq;
1801 struct kse *ke;
1802
1803 mtx_assert(&sched_lock, MA_OWNED);
1804 ke = td->td_kse;
1805 CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
1806 td, td->td_proc->p_comm, td->td_priority, curthread,
1807 curthread->td_proc->p_comm);
1808 /*
1809 * It is safe to just return here because sched_rem() is only ever
1810 * used in places where we're immediately going to add the
1811 * kse back on again. In that case it'll be added with the correct
1812 * thread and priority when the caller drops the sched_lock.
1813 */
1814 if (ke->ke_flags & KEF_ASSIGNED) {
1815 SLOT_RELEASE(td->td_ksegrp);
1816 ke->ke_flags |= KEF_REMOVED;
1817 return;
1818 }
1819 KASSERT((ke->ke_state == KES_ONRUNQ),
1820 ("sched_rem: KSE not on run queue"));
1821
1822 SLOT_RELEASE(td->td_ksegrp);
1823 ke->ke_state = KES_THREAD;
1824 kseq = KSEQ_CPU(ke->ke_cpu);
1825 kseq_runq_rem(kseq, ke);
1826 kseq_load_rem(kseq, ke);
1827 }
1828
1829 fixpt_t
1830 sched_pctcpu(struct thread *td)
1831 {
1832 fixpt_t pctcpu;
1833 struct kse *ke;
1834
1835 pctcpu = 0;
1836 ke = td->td_kse;
1837 if (ke == NULL)
1838 return (0);
1839
1840 mtx_lock_spin(&sched_lock);
1841 if (ke->ke_ticks) {
1842 int rtick;
1843
1844 /*
1845 * Don't update more frequently than twice a second. Allowing
1846 * this causes the cpu usage to decay away too quickly due to
1847 * rounding errors.
1848 */
1849 if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
1850 ke->ke_ltick < (ticks - (hz / 2)))
1851 sched_pctcpu_update(ke);
1852 /* How many rtick per second ? */
1853 rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
1854 pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1855 }
1856
1857 ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1858 mtx_unlock_spin(&sched_lock);
1859
1860 return (pctcpu);
1861 }
1862
1863 void
1864 sched_bind(struct thread *td, int cpu)
1865 {
1866 struct kse *ke;
1867
1868 mtx_assert(&sched_lock, MA_OWNED);
1869 ke = td->td_kse;
1870 ke->ke_flags |= KEF_BOUND;
1871 #ifdef SMP
1872 if (PCPU_GET(cpuid) == cpu)
1873 return;
1874 /* sched_rem without the runq_remove */
1875 ke->ke_state = KES_THREAD;
1876 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
1877 kseq_notify(ke, cpu);
1878 /* When we return from mi_switch we'll be on the correct cpu. */
1879 mi_switch(SW_VOL, NULL);
1880 #endif
1881 }
1882
1883 void
1884 sched_unbind(struct thread *td)
1885 {
1886 mtx_assert(&sched_lock, MA_OWNED);
1887 td->td_kse->ke_flags &= ~KEF_BOUND;
1888 }
1889
1890 int
1891 sched_load(void)
1892 {
1893 #ifdef SMP
1894 int total;
1895 int i;
1896
1897 total = 0;
1898 for (i = 0; i <= ksg_maxid; i++)
1899 total += KSEQ_GROUP(i)->ksg_load;
1900 return (total);
1901 #else
1902 return (KSEQ_SELF()->ksq_sysload);
1903 #endif
1904 }
1905
1906 int
1907 sched_sizeof_ksegrp(void)
1908 {
1909 return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1910 }
1911
1912 int
1913 sched_sizeof_proc(void)
1914 {
1915 return (sizeof(struct proc));
1916 }
1917
1918 int
1919 sched_sizeof_thread(void)
1920 {
1921 return (sizeof(struct thread) + sizeof(struct td_sched));
1922 }
1923 #define KERN_SWITCH_INCLUDE 1
1924 #include "kern/kern_switch.c"