sys/kern/sched_ule.c
1 /*-
2 * Copyright (c) 2002-2005, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice unmodified, this list of conditions, and the following
10 * disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD: releng/6.4/sys/kern/sched_ule.c 150613 2005-09-27 12:00:31Z davidxu $");
29
30 #include "opt_hwpmc_hooks.h"
31 #include "opt_sched.h"
32
33 #define kse td_sched
34
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kdb.h>
38 #include <sys/kernel.h>
39 #include <sys/ktr.h>
40 #include <sys/lock.h>
41 #include <sys/mutex.h>
42 #include <sys/proc.h>
43 #include <sys/resource.h>
44 #include <sys/resourcevar.h>
45 #include <sys/sched.h>
46 #include <sys/smp.h>
47 #include <sys/sx.h>
48 #include <sys/sysctl.h>
49 #include <sys/sysproto.h>
50 #include <sys/turnstile.h>
51 #include <sys/vmmeter.h>
52 #ifdef KTRACE
53 #include <sys/uio.h>
54 #include <sys/ktrace.h>
55 #endif
56
57 #ifdef HWPMC_HOOKS
58 #include <sys/pmckern.h>
59 #endif
60
61 #include <machine/cpu.h>
62 #include <machine/smp.h>
63
64 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
65 /* XXX This is bogus compatibility crap for ps */
66 static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
67 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
68
69 static void sched_setup(void *dummy);
70 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
71
72 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
73
74 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
75 "Scheduler name");
76
77 static int slice_min = 1;
78 SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
79
80 static int slice_max = 10;
81 SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
82
83 int realstathz;
84 int tickincr = 1;
85
86 /*
87  * The following data structures are allocated within their parent structure
88 * but are scheduler specific.
89 */
90 /*
91 * The schedulable entity that can be given a context to run. A process may
92 * have several of these.
93 */
94 struct kse {
95 TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */
96 int ke_flags; /* (j) KEF_* flags. */
97 struct thread *ke_thread; /* (*) Active associated thread. */
98 fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */
99 char ke_rqindex; /* (j) Run queue index. */
100 enum {
101 KES_THREAD = 0x0, /* slaved to thread state */
102 KES_ONRUNQ
103 } ke_state; /* (j) thread sched specific status. */
104 int ke_slptime;
105 int ke_slice;
106 struct runq *ke_runq;
107 u_char ke_cpu; /* CPU that we have affinity for. */
108 /* The following variables are only used for pctcpu calculation */
109 int ke_ltick; /* Last tick that we were running on */
110 int ke_ftick; /* First tick that we were running on */
111 int ke_ticks; /* Tick count */
112
113 };
114 #define td_kse td_sched
115 #define td_slptime td_kse->ke_slptime
116 #define ke_proc ke_thread->td_proc
117 #define ke_ksegrp ke_thread->td_ksegrp
118 #define ke_assign ke_procq.tqe_next
119 /* flags kept in ke_flags */
120 #define KEF_ASSIGNED 0x0001 /* Thread is being migrated. */
121 #define KEF_BOUND 0x0002 /* Thread can not migrate. */
122 #define KEF_XFERABLE 0x0004 /* Thread was added as transferable. */
123 #define KEF_HOLD 0x0008 /* Thread is temporarily bound. */
124 #define KEF_REMOVED 0x0010 /* Thread was removed while ASSIGNED */
125 #define KEF_INTERNAL 0x0020 /* Thread added due to migration. */
126 #define KEF_PREEMPTED 0x0040 /* Thread was preempted */
127 #define KEF_DIDRUN 0x02000 /* Thread actually ran. */
128 #define KEF_EXIT 0x04000 /* Thread is being killed. */
129
130 struct kg_sched {
131 struct thread *skg_last_assigned; /* (j) Last thread assigned to */
132 /* the system scheduler */
133 	int	skg_slptime;		/* Number of ticks we voluntarily slept */
134 int skg_runtime; /* Number of ticks we were running */
135 int skg_avail_opennings; /* (j) Num unfilled slots in group.*/
136 int skg_concurrency; /* (j) Num threads requested in group.*/
137 };
138 #define kg_last_assigned kg_sched->skg_last_assigned
139 #define kg_avail_opennings kg_sched->skg_avail_opennings
140 #define kg_concurrency kg_sched->skg_concurrency
141 #define kg_runtime kg_sched->skg_runtime
142 #define kg_slptime kg_sched->skg_slptime
143
144 #define SLOT_RELEASE(kg) (kg)->kg_avail_opennings++
145 #define SLOT_USE(kg) (kg)->kg_avail_opennings--
146
147 static struct kse kse0;
148 static struct kg_sched kg_sched0;
149
150 /*
151 * The priority is primarily determined by the interactivity score. Thus, we
152  * give lower (better) priorities to kse groups that use less CPU.  The nice
153 * value is then directly added to this to allow nice to have some effect
154 * on latency.
155 *
156 * PRI_RANGE: Total priority range for timeshare threads.
157 * PRI_NRESV: Number of nice values.
158 * PRI_BASE: The start of the dynamic range.
159 */
160 #define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
161 #define SCHED_PRI_NRESV ((PRIO_MAX - PRIO_MIN) + 1)
162 #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2)
163 #define SCHED_PRI_BASE (PRI_MIN_TIMESHARE)
164 #define SCHED_PRI_INTERACT(score) \
165 ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
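/*
 * Worked example, assuming the stock 6.x priority layout where
 * PRI_MIN_TIMESHARE is 160 and PRI_MAX_TIMESHARE is 223 (so
 * SCHED_PRI_RANGE is 64): an interactivity score of 50 maps to
 * 50 * 64 / 100 == 32 steps above the base, so sched_priority()
 * computes 160 + 32 + p_nice before clamping to the timeshare range.
 */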
166
167 /*
168 * These determine the interactivity of a process.
169 *
170 * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate
171 * before throttling back.
172 * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time.
173 * INTERACT_MAX: Maximum interactivity value. Smaller is better.
174  * INTERACT_THRESH:	Threshold for placement on the current runq.
175 */
176 #define SCHED_SLP_RUN_MAX ((hz * 5) << 10)
177 #define SCHED_SLP_RUN_FORK ((hz / 2) << 10)
178 #define SCHED_INTERACT_MAX (100)
179 #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2)
180 #define SCHED_INTERACT_THRESH (30)
181
182 /*
183 * These parameters and macros determine the size of the time slice that is
184 * granted to each thread.
185 *
186 * SLICE_MIN: Minimum time slice granted, in units of ticks.
187 * SLICE_MAX: Maximum time slice granted.
188 * SLICE_RANGE: Range of available time slices scaled by hz.
189  * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
190 * SLICE_NICE: Determine the amount of slice granted to a scaled nice.
191 * SLICE_NTHRESH: The nice cutoff point for slice assignment.
192 */
193 #define SCHED_SLICE_MIN (slice_min)
194 #define SCHED_SLICE_MAX (slice_max)
195 #define SCHED_SLICE_INTERACTIVE (slice_max)
196 #define SCHED_SLICE_NTHRESH (SCHED_PRI_NHALF - 1)
197 #define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
198 #define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max))
199 #define SCHED_SLICE_NICE(nice) \
200 (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
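/*
 * Worked example, assuming hz == 1000 so that sched_setup() picks
 * slice_min == 10 and slice_max == 142 ticks (SCHED_SLICE_RANGE == 133):
 * a kse at the queue's least nice value (distance 0) gets the full
 * 142-tick slice, a nice distance of 9 gets 142 - (9 * 133) / 19 == 79
 * ticks, and the cutoff distance SCHED_SLICE_NTHRESH == 19 gets 9 ticks.
 */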
201
202 /*
203 * This macro determines whether or not the thread belongs on the current or
204 * next run queue.
205 */
206 #define SCHED_INTERACTIVE(kg) \
207 (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
208 #define SCHED_CURR(kg, ke) \
209 ((ke->ke_thread->td_flags & TDF_BORROWING) || \
210 (ke->ke_flags & KEF_PREEMPTED) || SCHED_INTERACTIVE(kg))
211
212 /*
213  * CPU percentage computation macros and defines.
214 *
215 * SCHED_CPU_TIME: Number of seconds to average the cpu usage across.
216 * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across.
217 */
218
219 #define SCHED_CPU_TIME 10
220 #define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME)
221
222 /*
223 * kseq - per processor runqs and statistics.
224 */
225 struct kseq {
226 struct runq ksq_idle; /* Queue of IDLE threads. */
227 struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */
228 struct runq *ksq_next; /* Next timeshare queue. */
229 struct runq *ksq_curr; /* Current queue. */
230 int ksq_load_timeshare; /* Load for timeshare. */
231 int ksq_load; /* Aggregate load. */
232 short ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
233 short ksq_nicemin; /* Least nice. */
234 #ifdef SMP
235 int ksq_transferable;
236 LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */
237 struct kseq_group *ksq_group; /* Our processor group. */
238 volatile struct kse *ksq_assigned; /* assigned by another CPU. */
239 #else
240 int ksq_sysload; /* For loadavg, !ITHD load. */
241 #endif
242 };
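/*
 * The two ksq_timeshare queues form a current/next pair: threads that
 * exhaust their slice are requeued on ksq_next, and kseq_choose() swaps
 * the two pointers once ksq_curr drains.  This is what keeps CPU-bound
 * timeshare threads from starving threads queued behind them.
 */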
243
244 #ifdef SMP
245 /*
246 * kseq groups are groups of processors which can cheaply share threads. When
247 * one processor in the group goes idle it will check the runqs of the other
248 * processors in its group prior to halting and waiting for an interrupt.
249  * These groups are suitable for SMT (Simultaneous Multi-Threading) and not NUMA.
250  * In a NUMA environment we'd want an idle bitmap per group and a two-tiered
251 * load balancer.
252 */
253 struct kseq_group {
254 int ksg_cpus; /* Count of CPUs in this kseq group. */
255 cpumask_t ksg_cpumask; /* Mask of cpus in this group. */
256 cpumask_t ksg_idlemask; /* Idle cpus in this group. */
257 cpumask_t ksg_mask; /* Bit mask for first cpu. */
258 int ksg_load; /* Total load of this group. */
259 int ksg_transferable; /* Transferable load of this group. */
260 LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */
261 };
262 #endif
263
264 /*
265 * One kse queue per processor.
266 */
267 #ifdef SMP
268 static cpumask_t kseq_idle;
269 static int ksg_maxid;
270 static struct kseq kseq_cpu[MAXCPU];
271 static struct kseq_group kseq_groups[MAXCPU];
272 static int bal_tick;
273 static int gbal_tick;
274 static int balance_groups;
275
276 #define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)])
277 #define KSEQ_CPU(x) (&kseq_cpu[(x)])
278 #define KSEQ_ID(x) ((x) - kseq_cpu)
279 #define KSEQ_GROUP(x) (&kseq_groups[(x)])
280 #else /* !SMP */
281 static struct kseq kseq_cpu;
282
283 #define KSEQ_SELF() (&kseq_cpu)
284 #define KSEQ_CPU(x) (&kseq_cpu)
285 #endif
286
287 static void slot_fill(struct ksegrp *);
288 static struct kse *sched_choose(void); /* XXX Should be thread * */
289 static void sched_slice(struct kse *);
290 static void sched_priority(struct ksegrp *);
291 static void sched_thread_priority(struct thread *, u_char);
292 static int sched_interact_score(struct ksegrp *);
293 static void sched_interact_update(struct ksegrp *);
294 static void sched_interact_fork(struct ksegrp *);
295 static void sched_pctcpu_update(struct kse *);
296
297 /* Operations on per processor queues */
298 static struct kse * kseq_choose(struct kseq *);
299 static void kseq_setup(struct kseq *);
300 static void kseq_load_add(struct kseq *, struct kse *);
301 static void kseq_load_rem(struct kseq *, struct kse *);
302 static __inline void kseq_runq_add(struct kseq *, struct kse *, int);
303 static __inline void kseq_runq_rem(struct kseq *, struct kse *);
304 static void kseq_nice_add(struct kseq *, int);
305 static void kseq_nice_rem(struct kseq *, int);
306 void kseq_print(int cpu);
307 #ifdef SMP
308 static int kseq_transfer(struct kseq *, struct kse *, int);
309 static struct kse *runq_steal(struct runq *);
310 static void sched_balance(void);
311 static void sched_balance_groups(void);
312 static void sched_balance_group(struct kseq_group *);
313 static void sched_balance_pair(struct kseq *, struct kseq *);
314 static void kseq_move(struct kseq *, int);
315 static int kseq_idled(struct kseq *);
316 static void kseq_notify(struct kse *, int);
317 static void kseq_assign(struct kseq *);
318 static struct kse *kseq_steal(struct kseq *, int);
319 #define KSE_CAN_MIGRATE(ke) \
320 ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
321 #endif
322
323 void
324 kseq_print(int cpu)
325 {
326 struct kseq *kseq;
327 int i;
328
329 kseq = KSEQ_CPU(cpu);
330
331 printf("kseq:\n");
332 printf("\tload: %d\n", kseq->ksq_load);
333 printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
334 #ifdef SMP
335 printf("\tload transferable: %d\n", kseq->ksq_transferable);
336 #endif
337 printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
338 printf("\tnice counts:\n");
339 for (i = 0; i < SCHED_PRI_NRESV; i++)
340 if (kseq->ksq_nice[i])
341 printf("\t\t%d = %d\n",
342 i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
343 }
344
345 static __inline void
346 kseq_runq_add(struct kseq *kseq, struct kse *ke, int flags)
347 {
348 #ifdef SMP
349 if (KSE_CAN_MIGRATE(ke)) {
350 kseq->ksq_transferable++;
351 kseq->ksq_group->ksg_transferable++;
352 ke->ke_flags |= KEF_XFERABLE;
353 }
354 #endif
355 if (ke->ke_flags & KEF_PREEMPTED)
356 flags |= SRQ_PREEMPTED;
357 runq_add(ke->ke_runq, ke, flags);
358 }
359
360 static __inline void
361 kseq_runq_rem(struct kseq *kseq, struct kse *ke)
362 {
363 #ifdef SMP
364 if (ke->ke_flags & KEF_XFERABLE) {
365 kseq->ksq_transferable--;
366 kseq->ksq_group->ksg_transferable--;
367 ke->ke_flags &= ~KEF_XFERABLE;
368 }
369 #endif
370 runq_remove(ke->ke_runq, ke);
371 }
372
373 static void
374 kseq_load_add(struct kseq *kseq, struct kse *ke)
375 {
376 int class;
377 mtx_assert(&sched_lock, MA_OWNED);
378 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
379 if (class == PRI_TIMESHARE)
380 kseq->ksq_load_timeshare++;
381 kseq->ksq_load++;
382 CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
383 if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
384 #ifdef SMP
385 kseq->ksq_group->ksg_load++;
386 #else
387 kseq->ksq_sysload++;
388 #endif
389 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
390 kseq_nice_add(kseq, ke->ke_proc->p_nice);
391 }
392
393 static void
394 kseq_load_rem(struct kseq *kseq, struct kse *ke)
395 {
396 int class;
397 mtx_assert(&sched_lock, MA_OWNED);
398 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
399 if (class == PRI_TIMESHARE)
400 kseq->ksq_load_timeshare--;
401 if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
402 #ifdef SMP
403 kseq->ksq_group->ksg_load--;
404 #else
405 kseq->ksq_sysload--;
406 #endif
407 kseq->ksq_load--;
408 CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
409 ke->ke_runq = NULL;
410 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
411 kseq_nice_rem(kseq, ke->ke_proc->p_nice);
412 }
413
414 static void
415 kseq_nice_add(struct kseq *kseq, int nice)
416 {
417 mtx_assert(&sched_lock, MA_OWNED);
418 /* Normalize to zero. */
419 kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
420 if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
421 kseq->ksq_nicemin = nice;
422 }
423
424 static void
425 kseq_nice_rem(struct kseq *kseq, int nice)
426 {
427 int n;
428
429 mtx_assert(&sched_lock, MA_OWNED);
430 /* Normalize to zero. */
431 n = nice + SCHED_PRI_NHALF;
432 kseq->ksq_nice[n]--;
433 KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));
434
435 /*
436 * If this wasn't the smallest nice value or there are more in
437 * this bucket we can just return. Otherwise we have to recalculate
438 * the smallest nice.
439 */
440 if (nice != kseq->ksq_nicemin ||
441 kseq->ksq_nice[n] != 0 ||
442 kseq->ksq_load_timeshare == 0)
443 return;
444
445 for (; n < SCHED_PRI_NRESV; n++)
446 if (kseq->ksq_nice[n]) {
447 kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
448 return;
449 }
450 }
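/*
 * Illustration of the nice bookkeeping above: with PRIO_MIN == -20 and
 * SCHED_PRI_NHALF == 20, a thread at nice -5 is counted in bin 15.  When
 * the last thread leaves the bin holding ksq_nicemin, the loop in
 * kseq_nice_rem() walks upward from that bin to find the new minimum.
 */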
451
452 #ifdef SMP
453 /*
454 * sched_balance is a simple CPU load balancing algorithm. It operates by
455 * finding the least loaded and most loaded cpu and equalizing their load
456 * by migrating some processes.
457 *
458 * Dealing only with two CPUs at a time has two advantages. Firstly, most
459 * installations will only have 2 cpus. Secondly, load balancing too much at
460 * once can have an unpleasant effect on the system. The scheduler rarely has
461  * enough information to make perfect decisions.  So this algorithm chooses
462  * simplicity and more gradual effects on load in larger systems.
463 *
464 * It could be improved by considering the priorities and slices assigned to
465 * each task prior to balancing them. There are many pathological cases with
466  * any approach and so the semi-random algorithm below may work as well as any.
467 *
468 */
469 static void
470 sched_balance(void)
471 {
472 struct kseq_group *high;
473 struct kseq_group *low;
474 struct kseq_group *ksg;
475 int cnt;
476 int i;
477
478 bal_tick = ticks + (random() % (hz * 2));
479 if (smp_started == 0)
480 return;
481 low = high = NULL;
482 i = random() % (ksg_maxid + 1);
483 for (cnt = 0; cnt <= ksg_maxid; cnt++) {
484 ksg = KSEQ_GROUP(i);
485 /*
486 * Find the CPU with the highest load that has some
487 * threads to transfer.
488 */
489 if ((high == NULL || ksg->ksg_load > high->ksg_load)
490 && ksg->ksg_transferable)
491 high = ksg;
492 if (low == NULL || ksg->ksg_load < low->ksg_load)
493 low = ksg;
494 if (++i > ksg_maxid)
495 i = 0;
496 }
497 if (low != NULL && high != NULL && high != low)
498 sched_balance_pair(LIST_FIRST(&high->ksg_members),
499 LIST_FIRST(&low->ksg_members));
500 }
501
502 static void
503 sched_balance_groups(void)
504 {
505 int i;
506
507 gbal_tick = ticks + (random() % (hz * 2));
508 mtx_assert(&sched_lock, MA_OWNED);
509 if (smp_started)
510 for (i = 0; i <= ksg_maxid; i++)
511 sched_balance_group(KSEQ_GROUP(i));
512 }
513
514 static void
515 sched_balance_group(struct kseq_group *ksg)
516 {
517 struct kseq *kseq;
518 struct kseq *high;
519 struct kseq *low;
520 int load;
521
522 if (ksg->ksg_transferable == 0)
523 return;
524 low = NULL;
525 high = NULL;
526 LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
527 load = kseq->ksq_load;
528 if (high == NULL || load > high->ksq_load)
529 high = kseq;
530 if (low == NULL || load < low->ksq_load)
531 low = kseq;
532 }
533 if (high != NULL && low != NULL && high != low)
534 sched_balance_pair(high, low);
535 }
536
537 static void
538 sched_balance_pair(struct kseq *high, struct kseq *low)
539 {
540 int transferable;
541 int high_load;
542 int low_load;
543 int move;
544 int diff;
545 int i;
546
547 /*
548 	 * If we're transferring within a group we have to use this specific
549 * kseq's transferable count, otherwise we can steal from other members
550 * of the group.
551 */
552 if (high->ksq_group == low->ksq_group) {
553 transferable = high->ksq_transferable;
554 high_load = high->ksq_load;
555 low_load = low->ksq_load;
556 } else {
557 transferable = high->ksq_group->ksg_transferable;
558 high_load = high->ksq_group->ksg_load;
559 low_load = low->ksq_group->ksg_load;
560 }
561 if (transferable == 0)
562 return;
563 /*
564 * Determine what the imbalance is and then adjust that to how many
565 * kses we actually have to give up (transferable).
566 */
567 diff = high_load - low_load;
568 move = diff / 2;
569 if (diff & 0x1)
570 move++;
571 move = min(move, transferable);
572 for (i = 0; i < move; i++)
573 kseq_move(high, KSEQ_ID(low));
574 return;
575 }
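/*
 * Worked example of the computation above: with high_load == 7 and
 * low_load == 2, diff == 5 and move rounds up to 3, so up to three kses
 * are pushed toward the low queue, capped by the transferable count.
 */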
576
577 static void
578 kseq_move(struct kseq *from, int cpu)
579 {
580 struct kseq *kseq;
581 struct kseq *to;
582 struct kse *ke;
583
584 kseq = from;
585 to = KSEQ_CPU(cpu);
586 ke = kseq_steal(kseq, 1);
587 if (ke == NULL) {
588 struct kseq_group *ksg;
589
590 ksg = kseq->ksq_group;
591 LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
592 if (kseq == from || kseq->ksq_transferable == 0)
593 continue;
594 ke = kseq_steal(kseq, 1);
595 break;
596 }
597 if (ke == NULL)
598 panic("kseq_move: No KSEs available with a "
599 "transferable count of %d\n",
600 ksg->ksg_transferable);
601 }
602 if (kseq == to)
603 return;
604 ke->ke_state = KES_THREAD;
605 kseq_runq_rem(kseq, ke);
606 kseq_load_rem(kseq, ke);
607 kseq_notify(ke, cpu);
608 }
609
610 static int
611 kseq_idled(struct kseq *kseq)
612 {
613 struct kseq_group *ksg;
614 struct kseq *steal;
615 struct kse *ke;
616
617 ksg = kseq->ksq_group;
618 /*
619 	 * If we're in a cpu group, try to steal kses from another cpu in
620 * the group before idling.
621 */
622 if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
623 LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
624 if (steal == kseq || steal->ksq_transferable == 0)
625 continue;
626 ke = kseq_steal(steal, 0);
627 if (ke == NULL)
628 continue;
629 ke->ke_state = KES_THREAD;
630 kseq_runq_rem(steal, ke);
631 kseq_load_rem(steal, ke);
632 ke->ke_cpu = PCPU_GET(cpuid);
633 ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
634 sched_add(ke->ke_thread, SRQ_YIELDING);
635 return (0);
636 }
637 }
638 /*
639 * We only set the idled bit when all of the cpus in the group are
640 * idle. Otherwise we could get into a situation where a KSE bounces
641 	 * back and forth between two idle cores on separate physical CPUs.
642 */
643 ksg->ksg_idlemask |= PCPU_GET(cpumask);
644 if (ksg->ksg_idlemask != ksg->ksg_cpumask)
645 return (1);
646 atomic_set_int(&kseq_idle, ksg->ksg_mask);
647 return (1);
648 }
649
650 static void
651 kseq_assign(struct kseq *kseq)
652 {
653 struct kse *nke;
654 struct kse *ke;
655
656 do {
657 *(volatile struct kse **)&ke = kseq->ksq_assigned;
658 } while(!atomic_cmpset_ptr((volatile uintptr_t *)&kseq->ksq_assigned,
659 (uintptr_t)ke, (uintptr_t)NULL));
660 for (; ke != NULL; ke = nke) {
661 nke = ke->ke_assign;
662 kseq->ksq_group->ksg_load--;
663 kseq->ksq_load--;
664 ke->ke_flags &= ~KEF_ASSIGNED;
665 if (ke->ke_flags & KEF_REMOVED) {
666 ke->ke_flags &= ~KEF_REMOVED;
667 continue;
668 }
669 ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
670 sched_add(ke->ke_thread, SRQ_YIELDING);
671 }
672 }
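/*
 * ksq_assigned is a lock-free LIFO: kseq_notify() pushes a kse onto the
 * list head with a compare-and-swap, and kseq_assign() above detaches
 * the entire chain at once by swapping the head with NULL before
 * walking it via the ke_assign link.
 */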
673
674 static void
675 kseq_notify(struct kse *ke, int cpu)
676 {
677 struct kseq *kseq;
678 struct thread *td;
679 struct pcpu *pcpu;
680 int class;
681 int prio;
682
683 kseq = KSEQ_CPU(cpu);
684 /* XXX */
685 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
686 if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
687 (kseq_idle & kseq->ksq_group->ksg_mask))
688 atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
689 kseq->ksq_group->ksg_load++;
690 kseq->ksq_load++;
691 ke->ke_cpu = cpu;
692 ke->ke_flags |= KEF_ASSIGNED;
693 prio = ke->ke_thread->td_priority;
694
695 /*
696 * Place a KSE on another cpu's queue and force a resched.
697 */
698 do {
699 *(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned;
700 } while(!atomic_cmpset_ptr((volatile uintptr_t *)&kseq->ksq_assigned,
701 (uintptr_t)ke->ke_assign, (uintptr_t)ke));
702 /*
703 * Without sched_lock we could lose a race where we set NEEDRESCHED
704 * on a thread that is switched out before the IPI is delivered. This
705 * would lead us to miss the resched. This will be a problem once
706 * sched_lock is pushed down.
707 */
708 pcpu = pcpu_find(cpu);
709 td = pcpu->pc_curthread;
710 if (ke->ke_thread->td_priority < td->td_priority ||
711 td == pcpu->pc_idlethread) {
712 td->td_flags |= TDF_NEEDRESCHED;
713 ipi_selected(1 << cpu, IPI_AST);
714 }
715 }
716
717 static struct kse *
718 runq_steal(struct runq *rq)
719 {
720 struct rqhead *rqh;
721 struct rqbits *rqb;
722 struct kse *ke;
723 int word;
724 int bit;
725
726 mtx_assert(&sched_lock, MA_OWNED);
727 rqb = &rq->rq_status;
728 for (word = 0; word < RQB_LEN; word++) {
729 if (rqb->rqb_bits[word] == 0)
730 continue;
731 for (bit = 0; bit < RQB_BPW; bit++) {
732 if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
733 continue;
734 rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
735 TAILQ_FOREACH(ke, rqh, ke_procq) {
736 if (KSE_CAN_MIGRATE(ke))
737 return (ke);
738 }
739 }
740 }
741 return (NULL);
742 }
743
744 static struct kse *
745 kseq_steal(struct kseq *kseq, int stealidle)
746 {
747 struct kse *ke;
748
749 /*
750 * Steal from next first to try to get a non-interactive task that
751 * may not have run for a while.
752 */
753 if ((ke = runq_steal(kseq->ksq_next)) != NULL)
754 return (ke);
755 if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
756 return (ke);
757 if (stealidle)
758 return (runq_steal(&kseq->ksq_idle));
759 return (NULL);
760 }
761
762 int
763 kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
764 {
765 struct kseq_group *nksg;
766 struct kseq_group *ksg;
767 struct kseq *old;
768 int cpu;
769 int idx;
770
771 if (smp_started == 0)
772 return (0);
773 cpu = 0;
774 /*
775 * If our load exceeds a certain threshold we should attempt to
776 * reassign this thread. The first candidate is the cpu that
777 	 * originally ran the thread.  If it is idle, assign it there;
778 	 * otherwise, pick an idle cpu.
779 *
780 * The threshold at which we start to reassign kses has a large impact
781 * on the overall performance of the system. Tuned too high and
782 * some CPUs may idle. Too low and there will be excess migration
783 * and context switches.
784 */
785 old = KSEQ_CPU(ke->ke_cpu);
786 nksg = old->ksq_group;
787 ksg = kseq->ksq_group;
788 if (kseq_idle) {
789 if (kseq_idle & nksg->ksg_mask) {
790 cpu = ffs(nksg->ksg_idlemask);
791 if (cpu) {
792 CTR2(KTR_SCHED,
793 "kseq_transfer: %p found old cpu %X "
794 "in idlemask.", ke, cpu);
795 goto migrate;
796 }
797 }
798 /*
799 * Multiple cpus could find this bit simultaneously
800 * but the race shouldn't be terrible.
801 */
802 cpu = ffs(kseq_idle);
803 if (cpu) {
804 CTR2(KTR_SCHED, "kseq_transfer: %p found %X "
805 "in idlemask.", ke, cpu);
806 goto migrate;
807 }
808 }
809 idx = 0;
810 #if 0
811 if (old->ksq_load < kseq->ksq_load) {
812 cpu = ke->ke_cpu + 1;
813 CTR2(KTR_SCHED, "kseq_transfer: %p old cpu %X "
814 "load less than ours.", ke, cpu);
815 goto migrate;
816 }
817 /*
818 * No new CPU was found, look for one with less load.
819 */
820 for (idx = 0; idx <= ksg_maxid; idx++) {
821 nksg = KSEQ_GROUP(idx);
822 if (nksg->ksg_load /*+ (nksg->ksg_cpus * 2)*/ < ksg->ksg_load) {
823 cpu = ffs(nksg->ksg_cpumask);
824 CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X load less "
825 "than ours.", ke, cpu);
826 goto migrate;
827 }
828 }
829 #endif
830 /*
831 	 * If another cpu in this group has idled, assign a thread to it
832 	 * after checking to see if there are idled groups.
833 */
834 if (ksg->ksg_idlemask) {
835 cpu = ffs(ksg->ksg_idlemask);
836 if (cpu) {
837 CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X idle in "
838 "group.", ke, cpu);
839 goto migrate;
840 }
841 }
842 return (0);
843 migrate:
844 /*
845 * Now that we've found an idle CPU, migrate the thread.
846 */
847 cpu--;
848 ke->ke_runq = NULL;
849 kseq_notify(ke, cpu);
850
851 return (1);
852 }
853
854 #endif /* SMP */
855
856 /*
857 * Pick the highest priority task we have and return it.
858 */
859
860 static struct kse *
861 kseq_choose(struct kseq *kseq)
862 {
863 struct runq *swap;
864 struct kse *ke;
865 int nice;
866
867 mtx_assert(&sched_lock, MA_OWNED);
868 swap = NULL;
869
870 for (;;) {
871 ke = runq_choose(kseq->ksq_curr);
872 if (ke == NULL) {
873 /*
874 * We already swapped once and didn't get anywhere.
875 */
876 if (swap)
877 break;
878 swap = kseq->ksq_curr;
879 kseq->ksq_curr = kseq->ksq_next;
880 kseq->ksq_next = swap;
881 continue;
882 }
883 /*
884 * If we encounter a slice of 0 the kse is in a
885 * TIMESHARE kse group and its nice was too far out
886 * of the range that receives slices.
887 */
888 nice = ke->ke_proc->p_nice + (0 - kseq->ksq_nicemin);
889 #if 0
890 if (ke->ke_slice == 0 || (nice > SCHED_SLICE_NTHRESH &&
891 ke->ke_proc->p_nice != 0)) {
892 runq_remove(ke->ke_runq, ke);
893 sched_slice(ke);
894 ke->ke_runq = kseq->ksq_next;
895 runq_add(ke->ke_runq, ke, 0);
896 continue;
897 }
898 #endif
899 return (ke);
900 }
901
902 return (runq_choose(&kseq->ksq_idle));
903 }
904
905 static void
906 kseq_setup(struct kseq *kseq)
907 {
908 runq_init(&kseq->ksq_timeshare[0]);
909 runq_init(&kseq->ksq_timeshare[1]);
910 runq_init(&kseq->ksq_idle);
911 kseq->ksq_curr = &kseq->ksq_timeshare[0];
912 kseq->ksq_next = &kseq->ksq_timeshare[1];
913 kseq->ksq_load = 0;
914 kseq->ksq_load_timeshare = 0;
915 }
916
917 static void
918 sched_setup(void *dummy)
919 {
920 #ifdef SMP
921 int i;
922 #endif
923
924 slice_min = (hz/100); /* 10ms */
925 slice_max = (hz/7); /* ~140ms */
926
927 #ifdef SMP
928 balance_groups = 0;
929 /*
930 * Initialize the kseqs.
931 */
932 for (i = 0; i < MAXCPU; i++) {
933 struct kseq *ksq;
934
935 ksq = &kseq_cpu[i];
936 ksq->ksq_assigned = NULL;
937 kseq_setup(&kseq_cpu[i]);
938 }
939 if (smp_topology == NULL) {
940 struct kseq_group *ksg;
941 struct kseq *ksq;
942 int cpus;
943
944 for (cpus = 0, i = 0; i < MAXCPU; i++) {
945 if (CPU_ABSENT(i))
946 continue;
947 ksq = &kseq_cpu[cpus];
948 ksg = &kseq_groups[cpus];
949 /*
950 * Setup a kseq group with one member.
951 */
952 ksq->ksq_transferable = 0;
953 ksq->ksq_group = ksg;
954 ksg->ksg_cpus = 1;
955 ksg->ksg_idlemask = 0;
956 ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
957 ksg->ksg_load = 0;
958 ksg->ksg_transferable = 0;
959 LIST_INIT(&ksg->ksg_members);
960 LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
961 cpus++;
962 }
963 ksg_maxid = cpus - 1;
964 } else {
965 struct kseq_group *ksg;
966 struct cpu_group *cg;
967 int j;
968
969 for (i = 0; i < smp_topology->ct_count; i++) {
970 cg = &smp_topology->ct_group[i];
971 ksg = &kseq_groups[i];
972 /*
973 * Initialize the group.
974 */
975 ksg->ksg_idlemask = 0;
976 ksg->ksg_load = 0;
977 ksg->ksg_transferable = 0;
978 ksg->ksg_cpus = cg->cg_count;
979 ksg->ksg_cpumask = cg->cg_mask;
980 LIST_INIT(&ksg->ksg_members);
981 /*
982 * Find all of the group members and add them.
983 */
984 for (j = 0; j < MAXCPU; j++) {
985 if ((cg->cg_mask & (1 << j)) != 0) {
986 if (ksg->ksg_mask == 0)
987 ksg->ksg_mask = 1 << j;
988 kseq_cpu[j].ksq_transferable = 0;
989 kseq_cpu[j].ksq_group = ksg;
990 LIST_INSERT_HEAD(&ksg->ksg_members,
991 &kseq_cpu[j], ksq_siblings);
992 }
993 }
994 if (ksg->ksg_cpus > 1)
995 balance_groups = 1;
996 }
997 ksg_maxid = smp_topology->ct_count - 1;
998 }
999 /*
1000 * Stagger the group and global load balancer so they do not
1001 * interfere with each other.
1002 */
1003 bal_tick = ticks + hz;
1004 if (balance_groups)
1005 gbal_tick = ticks + (hz / 2);
1006 #else
1007 kseq_setup(KSEQ_SELF());
1008 #endif
1009 mtx_lock_spin(&sched_lock);
1010 kseq_load_add(KSEQ_SELF(), &kse0);
1011 mtx_unlock_spin(&sched_lock);
1012 }
1013
1014 /*
1015 * Scale the scheduling priority according to the "interactivity" of this
1016 * process.
1017 */
1018 static void
1019 sched_priority(struct ksegrp *kg)
1020 {
1021 int pri;
1022
1023 if (kg->kg_pri_class != PRI_TIMESHARE)
1024 return;
1025
1026 pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
1027 pri += SCHED_PRI_BASE;
1028 pri += kg->kg_proc->p_nice;
1029
1030 if (pri > PRI_MAX_TIMESHARE)
1031 pri = PRI_MAX_TIMESHARE;
1032 else if (pri < PRI_MIN_TIMESHARE)
1033 pri = PRI_MIN_TIMESHARE;
1034
1035 kg->kg_user_pri = pri;
1036
1037 return;
1038 }
1039
1040 /*
1041 * Calculate a time slice based on the properties of the kseg and the runq
1042 * that we're on. This is only for PRI_TIMESHARE ksegrps.
1043 */
1044 static void
1045 sched_slice(struct kse *ke)
1046 {
1047 struct kseq *kseq;
1048 struct ksegrp *kg;
1049
1050 kg = ke->ke_ksegrp;
1051 kseq = KSEQ_CPU(ke->ke_cpu);
1052
1053 if (ke->ke_thread->td_flags & TDF_BORROWING) {
1054 ke->ke_slice = SCHED_SLICE_MIN;
1055 return;
1056 }
1057
1058 /*
1059 * Rationale:
1060 	 * KSEs in interactive ksegs get SCHED_SLICE_INTERACTIVE, which is
1061 	 * currently defined as the maximum slice.
1062 *
1063 * KSEs in non-interactive ksegs are assigned a slice that is
1064 	 * based on the kseg's nice value relative to the least nice kseg
1065 * on the run queue for this cpu.
1066 *
1067 * If the KSE is less nice than all others it gets the maximum
1068 * slice and other KSEs will adjust their slice relative to
1069 * this when they first expire.
1070 *
1071 	 * There is a 20 point window that starts relative to the least
1072 * nice kse on the run queue. Slice size is determined by
1073 	 * the kse distance from the least nice ksegrp.
1074 *
1075 * If the kse is outside of the window it will get no slice
1076 * and will be reevaluated each time it is selected on the
1077 * run queue. The exception to this is nice 0 ksegs when
1078 * a nice -20 is running. They are always granted a minimum
1079 * slice.
1080 */
1081 if (!SCHED_INTERACTIVE(kg)) {
1082 int nice;
1083
1084 nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin);
1085 if (kseq->ksq_load_timeshare == 0 ||
1086 kg->kg_proc->p_nice < kseq->ksq_nicemin)
1087 ke->ke_slice = SCHED_SLICE_MAX;
1088 else if (nice <= SCHED_SLICE_NTHRESH)
1089 ke->ke_slice = SCHED_SLICE_NICE(nice);
1090 else if (kg->kg_proc->p_nice == 0)
1091 ke->ke_slice = SCHED_SLICE_MIN;
1092 else
1093 ke->ke_slice = SCHED_SLICE_MIN; /* 0 */
1094 } else
1095 ke->ke_slice = SCHED_SLICE_INTERACTIVE;
1096
1097 return;
1098 }
1099
1100 /*
1101 * This routine enforces a maximum limit on the amount of scheduling history
1102 * kept. It is called after either the slptime or runtime is adjusted.
1103 * This routine will not operate correctly when slp or run times have been
1104 * adjusted to more than double their maximum.
1105 */
1106 static void
1107 sched_interact_update(struct ksegrp *kg)
1108 {
1109 int sum;
1110
1111 sum = kg->kg_runtime + kg->kg_slptime;
1112 if (sum < SCHED_SLP_RUN_MAX)
1113 return;
1114 /*
1115 * If we have exceeded by more than 1/5th then the algorithm below
1116 * will not bring us back into range. Dividing by two here forces
1117 	 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
1118 */
1119 if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1120 kg->kg_runtime /= 2;
1121 kg->kg_slptime /= 2;
1122 return;
1123 }
1124 kg->kg_runtime = (kg->kg_runtime / 5) * 4;
1125 kg->kg_slptime = (kg->kg_slptime / 5) * 4;
1126 }
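/*
 * Worked example, assuming hz == 1000 so SCHED_SLP_RUN_MAX is
 * 5000 << 10 == 5120000: a sum of 1.3 times that maximum exceeds the
 * 6/5 cutoff above and both components are halved, while a sum of
 * 1.1 times the maximum is only scaled back by 4/5.
 */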
1127
1128 static void
1129 sched_interact_fork(struct ksegrp *kg)
1130 {
1131 int ratio;
1132 int sum;
1133
1134 sum = kg->kg_runtime + kg->kg_slptime;
1135 if (sum > SCHED_SLP_RUN_FORK) {
1136 ratio = sum / SCHED_SLP_RUN_FORK;
1137 kg->kg_runtime /= ratio;
1138 kg->kg_slptime /= ratio;
1139 }
1140 }
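/*
 * Example, assuming hz == 1000 so SCHED_SLP_RUN_FORK is 500 << 10: a
 * parent with three times that much slp+run history hands the child a
 * third of each component (ratio == 3), capping inherited history.
 */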
1141
1142 static int
1143 sched_interact_score(struct ksegrp *kg)
1144 {
1145 int div;
1146
1147 if (kg->kg_runtime > kg->kg_slptime) {
1148 div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
1149 return (SCHED_INTERACT_HALF +
1150 (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
1151 	} else if (kg->kg_slptime > kg->kg_runtime) {
1152 div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
1153 return (kg->kg_runtime / div);
1154 }
1155
1156 /*
1157 * This can happen if slptime and runtime are 0.
1158 */
1159 return (0);
1160
1161 }
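/*
 * Worked examples, assuming hz == 1000: a ksegrp that ran 3 seconds and
 * slept 1 (kg_runtime == 3072000, kg_slptime == 1024000) scores
 * 50 + (50 - 1024000 / (3072000 / 50)) == 84, well above
 * SCHED_INTERACT_THRESH.  With the two figures swapped the score is
 * 1024000 / (3072000 / 50) == 16, which classifies it as interactive.
 */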
1162
1163 /*
1164 * Very early in the boot some setup of scheduler-specific
1165  * parts of proc0 and of some scheduler resources needs to be done.
1166 * Called from:
1167 * proc0_init()
1168 */
1169 void
1170 schedinit(void)
1171 {
1172 /*
1173 * Set up the scheduler specific parts of proc0.
1174 */
1175 proc0.p_sched = NULL; /* XXX */
1176 ksegrp0.kg_sched = &kg_sched0;
1177 thread0.td_sched = &kse0;
1178 kse0.ke_thread = &thread0;
1179 kse0.ke_state = KES_THREAD;
1180 kg_sched0.skg_concurrency = 1;
1181 kg_sched0.skg_avail_opennings = 0; /* we are already running */
1182 }
1183
1184 /*
1185  * This is only somewhat accurate since, given many processes of the same
1186  * priority, they will switch when their slices run out, which will be
1187 * at most SCHED_SLICE_MAX.
1188 */
1189 int
1190 sched_rr_interval(void)
1191 {
1192 return (SCHED_SLICE_MAX);
1193 }
1194
1195 static void
1196 sched_pctcpu_update(struct kse *ke)
1197 {
1198 /*
1199 * Adjust counters and watermark for pctcpu calc.
1200 */
1201 if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
1202 /*
1203 * Shift the tick count out so that the divide doesn't
1204 * round away our results.
1205 */
1206 ke->ke_ticks <<= 10;
1207 ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
1208 SCHED_CPU_TICKS;
1209 ke->ke_ticks >>= 10;
1210 } else
1211 ke->ke_ticks = 0;
1212 ke->ke_ltick = ticks;
1213 ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
1214 }
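/*
 * Example of the rescaling above, assuming hz == 1000 (SCHED_CPU_TICKS
 * == 10000): a kse with 4000 run ticks spread over a 20000-tick window
 * is trimmed to 4000 * 10000 / 20000 == 2000 ticks over the trailing
 * ten seconds, preserving its 20% utilization estimate.
 */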
1215
1216 void
1217 sched_thread_priority(struct thread *td, u_char prio)
1218 {
1219 struct kse *ke;
1220
1221 CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
1222 td, td->td_proc->p_comm, td->td_priority, prio, curthread,
1223 curthread->td_proc->p_comm);
1224 ke = td->td_kse;
1225 mtx_assert(&sched_lock, MA_OWNED);
1226 if (td->td_priority == prio)
1227 return;
1228 if (TD_ON_RUNQ(td)) {
1229 /*
1230 * If the priority has been elevated due to priority
1231 * propagation, we may have to move ourselves to a new
1232 * queue. We still call adjustrunqueue below in case kse
1233 * needs to fix things up.
1234 */
1235 if (prio < td->td_priority && ke->ke_runq != NULL &&
1236 (ke->ke_flags & KEF_ASSIGNED) == 0 &&
1237 ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
1238 runq_remove(ke->ke_runq, ke);
1239 ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
1240 runq_add(ke->ke_runq, ke, 0);
1241 }
1242 /*
1243 * Hold this kse on this cpu so that sched_prio() doesn't
1244 * cause excessive migration. We only want migration to
1245 * happen as the result of a wakeup.
1246 */
1247 ke->ke_flags |= KEF_HOLD;
1248 adjustrunqueue(td, prio);
1249 ke->ke_flags &= ~KEF_HOLD;
1250 } else
1251 td->td_priority = prio;
1252 }
1253
1254 /*
1255 * Update a thread's priority when it is lent another thread's
1256 * priority.
1257 */
1258 void
1259 sched_lend_prio(struct thread *td, u_char prio)
1260 {
1261
1262 td->td_flags |= TDF_BORROWING;
1263 sched_thread_priority(td, prio);
1264 }
1265
1266 /*
1267 * Restore a thread's priority when priority propagation is
1268 * over. The prio argument is the minimum priority the thread
1269 * needs to have to satisfy other possible priority lending
1270 * requests. If the thread's regular priority is less
1271 * important than prio, the thread will keep a priority boost
1272 * of prio.
1273 */
1274 void
1275 sched_unlend_prio(struct thread *td, u_char prio)
1276 {
1277 u_char base_pri;
1278
1279 if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1280 td->td_base_pri <= PRI_MAX_TIMESHARE)
1281 base_pri = td->td_ksegrp->kg_user_pri;
1282 else
1283 base_pri = td->td_base_pri;
1284 if (prio >= base_pri) {
1285 td->td_flags &= ~TDF_BORROWING;
1286 sched_thread_priority(td, base_pri);
1287 } else
1288 sched_lend_prio(td, prio);
1289 }
1290
1291 void
1292 sched_prio(struct thread *td, u_char prio)
1293 {
1294 u_char oldprio;
1295
1296 /* First, update the base priority. */
1297 td->td_base_pri = prio;
1298
1299 /*
1300 * If the thread is borrowing another thread's priority, don't
1301 * ever lower the priority.
1302 */
1303 if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1304 return;
1305
1306 /* Change the real priority. */
1307 oldprio = td->td_priority;
1308 sched_thread_priority(td, prio);
1309
1310 /*
1311 * If the thread is on a turnstile, then let the turnstile update
1312 * its state.
1313 */
1314 if (TD_ON_LOCK(td) && oldprio != prio)
1315 turnstile_adjust(td, oldprio);
1316 }
1317
1318 void
1319 sched_switch(struct thread *td, struct thread *newtd, int flags)
1320 {
1321 struct kseq *ksq;
1322 struct kse *ke;
1323
1324 mtx_assert(&sched_lock, MA_OWNED);
1325
1326 ke = td->td_kse;
1327 ksq = KSEQ_SELF();
1328
1329 td->td_lastcpu = td->td_oncpu;
1330 td->td_oncpu = NOCPU;
1331 td->td_flags &= ~TDF_NEEDRESCHED;
1332 td->td_owepreempt = 0;
1333
1334 /*
1335 * If the KSE has been assigned it may be in the process of switching
1336 * to the new cpu. This is the case in sched_bind().
1337 */
1338 if (td == PCPU_GET(idlethread)) {
1339 TD_SET_CAN_RUN(td);
1340 } else if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
1341 /* We are ending our run so make our slot available again */
1342 SLOT_RELEASE(td->td_ksegrp);
1343 kseq_load_rem(ksq, ke);
1344 if (TD_IS_RUNNING(td)) {
1345 /*
1346 * Don't allow the thread to migrate
1347 * from a preemption.
1348 */
1349 ke->ke_flags |= KEF_HOLD;
1350 setrunqueue(td, (flags & SW_PREEMPT) ?
1351 SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1352 SRQ_OURSELF|SRQ_YIELDING);
1353 ke->ke_flags &= ~KEF_HOLD;
1354 } else if ((td->td_proc->p_flag & P_HADTHREADS) &&
1355 (newtd == NULL || newtd->td_ksegrp != td->td_ksegrp))
1356 /*
1357 * We will not be on the run queue.
1358 * So we must be sleeping or similar.
1359 * Don't use the slot if we will need it
1360 * for newtd.
1361 */
1362 slot_fill(td->td_ksegrp);
1363 }
1364 if (newtd != NULL) {
1365 /*
1366 * If we bring in a thread account for it as if it had been
1367 * added to the run queue and then chosen.
1368 */
1369 newtd->td_kse->ke_flags |= KEF_DIDRUN;
1370 newtd->td_kse->ke_runq = ksq->ksq_curr;
1371 TD_SET_RUNNING(newtd);
1372 kseq_load_add(KSEQ_SELF(), newtd->td_kse);
1373 /*
1374 * XXX When we preempt, we've already consumed a slot because
1375 * we got here through sched_add(). However, newtd can come
1376 * from thread_switchout() which can't SLOT_USE() because
1377 * the SLOT code is scheduler dependent. We must use the
1378 * slot here otherwise.
1379 */
1380 if ((flags & SW_PREEMPT) == 0)
1381 SLOT_USE(newtd->td_ksegrp);
1382 } else
1383 newtd = choosethread();
1384 if (td != newtd) {
1385 #ifdef HWPMC_HOOKS
1386 if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1387 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1388 #endif
1389 cpu_switch(td, newtd);
1390 #ifdef HWPMC_HOOKS
1391 if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1392 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1393 #endif
1394 }
1395
1396 sched_lock.mtx_lock = (uintptr_t)td;
1397
1398 td->td_oncpu = PCPU_GET(cpuid);
1399 }
1400
1401 void
1402 sched_nice(struct proc *p, int nice)
1403 {
1404 struct ksegrp *kg;
1405 struct kse *ke;
1406 struct thread *td;
1407 struct kseq *kseq;
1408
1409 PROC_LOCK_ASSERT(p, MA_OWNED);
1410 mtx_assert(&sched_lock, MA_OWNED);
1411 /*
1412 * We need to adjust the nice counts for running KSEs.
1413 */
1414 FOREACH_KSEGRP_IN_PROC(p, kg) {
1415 if (kg->kg_pri_class == PRI_TIMESHARE) {
1416 FOREACH_THREAD_IN_GROUP(kg, td) {
1417 ke = td->td_kse;
1418 if (ke->ke_runq == NULL)
1419 continue;
1420 kseq = KSEQ_CPU(ke->ke_cpu);
1421 kseq_nice_rem(kseq, p->p_nice);
1422 kseq_nice_add(kseq, nice);
1423 }
1424 }
1425 }
1426 p->p_nice = nice;
1427 FOREACH_KSEGRP_IN_PROC(p, kg) {
1428 sched_priority(kg);
1429 FOREACH_THREAD_IN_GROUP(kg, td)
1430 td->td_flags |= TDF_NEEDRESCHED;
1431 }
1432 }
1433
1434 void
1435 sched_sleep(struct thread *td)
1436 {
1437 mtx_assert(&sched_lock, MA_OWNED);
1438
1439 td->td_slptime = ticks;
1440 }
1441
1442 void
1443 sched_wakeup(struct thread *td)
1444 {
1445 mtx_assert(&sched_lock, MA_OWNED);
1446
1447 /*
1448 * Let the kseg know how long we slept for. This is because process
1449 * interactivity behavior is modeled in the kseg.
1450 */
1451 if (td->td_slptime) {
1452 struct ksegrp *kg;
1453 int hzticks;
1454
1455 kg = td->td_ksegrp;
1456 hzticks = (ticks - td->td_slptime) << 10;
1457 if (hzticks >= SCHED_SLP_RUN_MAX) {
1458 kg->kg_slptime = SCHED_SLP_RUN_MAX;
1459 kg->kg_runtime = 1;
1460 } else {
1461 kg->kg_slptime += hzticks;
1462 sched_interact_update(kg);
1463 }
1464 sched_priority(kg);
1465 sched_slice(td->td_kse);
1466 td->td_slptime = 0;
1467 }
1468 setrunqueue(td, SRQ_BORING);
1469 }
1470
1471 /*
1472 * Penalize the parent for creating a new child and initialize the child's
1473 * priority.
1474 */
1475 void
1476 sched_fork(struct thread *td, struct thread *childtd)
1477 {
1478
1479 mtx_assert(&sched_lock, MA_OWNED);
1480
1481 sched_fork_ksegrp(td, childtd->td_ksegrp);
1482 sched_fork_thread(td, childtd);
1483 }
1484
1485 void
1486 sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
1487 {
1488 struct ksegrp *kg = td->td_ksegrp;
1489 mtx_assert(&sched_lock, MA_OWNED);
1490
1491 child->kg_slptime = kg->kg_slptime;
1492 child->kg_runtime = kg->kg_runtime;
1493 child->kg_user_pri = kg->kg_user_pri;
1494 sched_interact_fork(child);
1495 kg->kg_runtime += tickincr << 10;
1496 sched_interact_update(kg);
1497 }
1498
1499 void
1500 sched_fork_thread(struct thread *td, struct thread *child)
1501 {
1502 struct kse *ke;
1503 struct kse *ke2;
1504
1505 sched_newthread(child);
1506 ke = td->td_kse;
1507 ke2 = child->td_kse;
1508 ke2->ke_slice = 1; /* Attempt to quickly learn interactivity. */
1509 ke2->ke_cpu = ke->ke_cpu;
1510 ke2->ke_runq = NULL;
1511
1512 	/* Grab our parent's cpu estimation information. */
1513 ke2->ke_ticks = ke->ke_ticks;
1514 ke2->ke_ltick = ke->ke_ltick;
1515 ke2->ke_ftick = ke->ke_ftick;
1516 }
1517
1518 void
1519 sched_class(struct ksegrp *kg, int class)
1520 {
1521 struct kseq *kseq;
1522 struct kse *ke;
1523 struct thread *td;
1524 int nclass;
1525 int oclass;
1526
1527 mtx_assert(&sched_lock, MA_OWNED);
1528 if (kg->kg_pri_class == class)
1529 return;
1530
1531 nclass = PRI_BASE(class);
1532 oclass = PRI_BASE(kg->kg_pri_class);
1533 FOREACH_THREAD_IN_GROUP(kg, td) {
1534 ke = td->td_kse;
1535 if ((ke->ke_state != KES_ONRUNQ &&
1536 ke->ke_state != KES_THREAD) || ke->ke_runq == NULL)
1537 continue;
1538 kseq = KSEQ_CPU(ke->ke_cpu);
1539
1540 #ifdef SMP
1541 /*
1542 * On SMP if we're on the RUNQ we must adjust the transferable
1543 		 * count because we could be changing to or from an interrupt
1544 * class.
1545 */
1546 if (ke->ke_state == KES_ONRUNQ) {
1547 if (KSE_CAN_MIGRATE(ke)) {
1548 kseq->ksq_transferable--;
1549 kseq->ksq_group->ksg_transferable--;
1550 }
1551 if (KSE_CAN_MIGRATE(ke)) {
1552 kseq->ksq_transferable++;
1553 kseq->ksq_group->ksg_transferable++;
1554 }
1555 }
1556 #endif
1557 if (oclass == PRI_TIMESHARE) {
1558 kseq->ksq_load_timeshare--;
1559 kseq_nice_rem(kseq, kg->kg_proc->p_nice);
1560 }
1561 if (nclass == PRI_TIMESHARE) {
1562 kseq->ksq_load_timeshare++;
1563 kseq_nice_add(kseq, kg->kg_proc->p_nice);
1564 }
1565 }
1566
1567 kg->kg_pri_class = class;
1568 }
1569
1570 /*
1571 * Return some of the child's priority and interactivity to the parent.
1572 */
1573 void
1574 sched_exit(struct proc *p, struct thread *childtd)
1575 {
1576 mtx_assert(&sched_lock, MA_OWNED);
1577 sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd);
1578 sched_exit_thread(NULL, childtd);
1579 }
1580
1581 void
1582 sched_exit_ksegrp(struct ksegrp *kg, struct thread *td)
1583 {
1584 /* kg->kg_slptime += td->td_ksegrp->kg_slptime; */
1585 kg->kg_runtime += td->td_ksegrp->kg_runtime;
1586 sched_interact_update(kg);
1587 }
1588
1589 void
1590 sched_exit_thread(struct thread *td, struct thread *childtd)
1591 {
1592 CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
1593 childtd, childtd->td_proc->p_comm, childtd->td_priority);
1594 kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
1595 }
1596
1597 void
1598 sched_clock(struct thread *td)
1599 {
1600 struct kseq *kseq;
1601 struct ksegrp *kg;
1602 struct kse *ke;
1603
1604 mtx_assert(&sched_lock, MA_OWNED);
1605 kseq = KSEQ_SELF();
1606 #ifdef SMP
1607 if (ticks >= bal_tick)
1608 sched_balance();
1609 if (ticks >= gbal_tick && balance_groups)
1610 sched_balance_groups();
1611 /*
1612 * We could have been assigned a non real-time thread without an
1613 * IPI.
1614 */
1615 if (kseq->ksq_assigned)
1616 kseq_assign(kseq); /* Potentially sets NEEDRESCHED */
1617 #endif
1618 /*
1619 * sched_setup() apparently happens prior to stathz being set. We
1620 * need to resolve the timers earlier in the boot so we can avoid
1621 * calculating this here.
1622 */
1623 if (realstathz == 0) {
1624 realstathz = stathz ? stathz : hz;
1625 tickincr = hz / realstathz;
1626 /*
1627 * XXX This does not work for values of stathz that are much
1628 * larger than hz.
1629 */
1630 if (tickincr == 0)
1631 tickincr = 1;
1632 }
1633
1634 ke = td->td_kse;
1635 kg = ke->ke_ksegrp;
1636
1637 /* Adjust ticks for pctcpu */
1638 ke->ke_ticks++;
1639 ke->ke_ltick = ticks;
1640
1641 /* Go up to one second beyond our max and then trim back down */
1642 if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
1643 sched_pctcpu_update(ke);
1644
1645 if (td->td_flags & TDF_IDLETD)
1646 return;
1647 /*
1648 	 * We only run the slicing code for TIMESHARE ksegrps.
1649 */
1650 if (kg->kg_pri_class != PRI_TIMESHARE)
1651 return;
1652 /*
1653 	 * We used a tick; charge it to the ksegrp so that we can compute our
1654 * interactivity.
1655 */
1656 kg->kg_runtime += tickincr << 10;
1657 sched_interact_update(kg);
1658
1659 /*
1660 * We used up one time slice.
1661 */
1662 if (--ke->ke_slice > 0)
1663 return;
1664 /*
1665 * We're out of time, recompute priorities and requeue.
1666 */
1667 kseq_load_rem(kseq, ke);
1668 sched_priority(kg);
1669 sched_slice(ke);
1670 if (SCHED_CURR(kg, ke))
1671 ke->ke_runq = kseq->ksq_curr;
1672 else
1673 ke->ke_runq = kseq->ksq_next;
1674 kseq_load_add(kseq, ke);
1675 td->td_flags |= TDF_NEEDRESCHED;
1676 }
1677
1678 int
1679 sched_runnable(void)
1680 {
1681 struct kseq *kseq;
1682 int load;
1683
1684 load = 1;
1685
1686 kseq = KSEQ_SELF();
1687 #ifdef SMP
1688 if (kseq->ksq_assigned) {
1689 mtx_lock_spin(&sched_lock);
1690 kseq_assign(kseq);
1691 mtx_unlock_spin(&sched_lock);
1692 }
1693 #endif
1694 if ((curthread->td_flags & TDF_IDLETD) != 0) {
1695 if (kseq->ksq_load > 0)
1696 goto out;
1697 } else
1698 if (kseq->ksq_load - 1 > 0)
1699 goto out;
1700 load = 0;
1701 out:
1702 return (load);
1703 }
1704
1705 void
1706 sched_userret(struct thread *td)
1707 {
1708 struct ksegrp *kg;
1709
1710 KASSERT((td->td_flags & TDF_BORROWING) == 0,
1711 ("thread with borrowed priority returning to userland"));
1712 kg = td->td_ksegrp;
1713 if (td->td_priority != kg->kg_user_pri) {
1714 mtx_lock_spin(&sched_lock);
1715 td->td_priority = kg->kg_user_pri;
1716 td->td_base_pri = kg->kg_user_pri;
1717 mtx_unlock_spin(&sched_lock);
1718 }
1719 }
1720
1721 struct kse *
1722 sched_choose(void)
1723 {
1724 struct kseq *kseq;
1725 struct kse *ke;
1726
1727 mtx_assert(&sched_lock, MA_OWNED);
1728 kseq = KSEQ_SELF();
1729 #ifdef SMP
1730 restart:
1731 if (kseq->ksq_assigned)
1732 kseq_assign(kseq);
1733 #endif
1734 ke = kseq_choose(kseq);
1735 if (ke) {
1736 #ifdef SMP
1737 if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
1738 if (kseq_idled(kseq) == 0)
1739 goto restart;
1740 #endif
1741 kseq_runq_rem(kseq, ke);
1742 ke->ke_state = KES_THREAD;
1743 ke->ke_flags &= ~KEF_PREEMPTED;
1744 return (ke);
1745 }
1746 #ifdef SMP
1747 if (kseq_idled(kseq) == 0)
1748 goto restart;
1749 #endif
1750 return (NULL);
1751 }
1752
1753 void
1754 sched_add(struct thread *td, int flags)
1755 {
1756 struct kseq *kseq;
1757 struct ksegrp *kg;
1758 struct kse *ke;
1759 int preemptive;
1760 int canmigrate;
1761 int class;
1762
1763 CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
1764 td, td->td_proc->p_comm, td->td_priority, curthread,
1765 curthread->td_proc->p_comm);
1766 mtx_assert(&sched_lock, MA_OWNED);
1767 ke = td->td_kse;
1768 kg = td->td_ksegrp;
1769 canmigrate = 1;
1770 preemptive = !(flags & SRQ_YIELDING);
1771 class = PRI_BASE(kg->kg_pri_class);
1772 kseq = KSEQ_SELF();
1773 if ((ke->ke_flags & KEF_INTERNAL) == 0)
1774 SLOT_USE(td->td_ksegrp);
1775 ke->ke_flags &= ~KEF_INTERNAL;
1776 #ifdef SMP
1777 if (ke->ke_flags & KEF_ASSIGNED) {
1778 if (ke->ke_flags & KEF_REMOVED)
1779 ke->ke_flags &= ~KEF_REMOVED;
1780 return;
1781 }
1782 canmigrate = KSE_CAN_MIGRATE(ke);
1783 /*
1784 * Don't migrate running threads here. Force the long term balancer
1785 * to do it.
1786 */
1787 if (ke->ke_flags & KEF_HOLD) {
1788 ke->ke_flags &= ~KEF_HOLD;
1789 canmigrate = 0;
1790 }
1791 #endif
1792 KASSERT(ke->ke_state != KES_ONRUNQ,
1793 ("sched_add: kse %p (%s) already in run queue", ke,
1794 ke->ke_proc->p_comm));
1795 KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1796 ("sched_add: process swapped out"));
1797 KASSERT(ke->ke_runq == NULL,
1798 ("sched_add: KSE %p is still assigned to a run queue", ke));
1799 if (flags & SRQ_PREEMPTED)
1800 ke->ke_flags |= KEF_PREEMPTED;
1801 switch (class) {
1802 case PRI_ITHD:
1803 case PRI_REALTIME:
1804 ke->ke_runq = kseq->ksq_curr;
1805 ke->ke_slice = SCHED_SLICE_MAX;
1806 if (canmigrate)
1807 ke->ke_cpu = PCPU_GET(cpuid);
1808 break;
1809 case PRI_TIMESHARE:
1810 if (SCHED_CURR(kg, ke))
1811 ke->ke_runq = kseq->ksq_curr;
1812 else
1813 ke->ke_runq = kseq->ksq_next;
1814 break;
1815 case PRI_IDLE:
1816 /*
1817 		 * This is for priority propagation.
1818 */
1819 if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
1820 ke->ke_runq = kseq->ksq_curr;
1821 else
1822 ke->ke_runq = &kseq->ksq_idle;
1823 ke->ke_slice = SCHED_SLICE_MIN;
1824 break;
1825 default:
1826 panic("Unknown pri class.");
1827 break;
1828 }
1829 #ifdef SMP
1830 /*
1831 * If this thread is pinned or bound, notify the target cpu.
1832 */
1833 	if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid)) {
1834 ke->ke_runq = NULL;
1835 kseq_notify(ke, ke->ke_cpu);
1836 return;
1837 }
1838 /*
1839 * If we had been idle, clear our bit in the group and potentially
1840 * the global bitmap. If not, see if we should transfer this thread.
1841 */
1842 if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
1843 (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
1844 /*
1845 * Check to see if our group is unidling, and if so, remove it
1846 * from the global idle mask.
1847 */
1848 if (kseq->ksq_group->ksg_idlemask ==
1849 kseq->ksq_group->ksg_cpumask)
1850 atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
1851 /*
1852 * Now remove ourselves from the group specific idle mask.
1853 */
1854 kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
1855 } else if (canmigrate && kseq->ksq_load > 1 && class != PRI_ITHD)
1856 if (kseq_transfer(kseq, ke, class))
1857 return;
1858 ke->ke_cpu = PCPU_GET(cpuid);
1859 #endif
1860 if (td->td_priority < curthread->td_priority &&
1861 ke->ke_runq == kseq->ksq_curr)
1862 curthread->td_flags |= TDF_NEEDRESCHED;
1863 if (preemptive && maybe_preempt(td))
1864 return;
1865 ke->ke_state = KES_ONRUNQ;
1866
1867 kseq_runq_add(kseq, ke, flags);
1868 kseq_load_add(kseq, ke);
1869 }
1870
1871 void
1872 sched_rem(struct thread *td)
1873 {
1874 struct kseq *kseq;
1875 struct kse *ke;
1876
1877 CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
1878 td, td->td_proc->p_comm, td->td_priority, curthread,
1879 curthread->td_proc->p_comm);
1880 mtx_assert(&sched_lock, MA_OWNED);
1881 ke = td->td_kse;
1882 SLOT_RELEASE(td->td_ksegrp);
1883 ke->ke_flags &= ~KEF_PREEMPTED;
1884 if (ke->ke_flags & KEF_ASSIGNED) {
1885 ke->ke_flags |= KEF_REMOVED;
1886 return;
1887 }
1888 KASSERT((ke->ke_state == KES_ONRUNQ),
1889 ("sched_rem: KSE not on run queue"));
1890
1891 ke->ke_state = KES_THREAD;
1892 kseq = KSEQ_CPU(ke->ke_cpu);
1893 kseq_runq_rem(kseq, ke);
1894 kseq_load_rem(kseq, ke);
1895 }
1896
1897 fixpt_t
1898 sched_pctcpu(struct thread *td)
1899 {
1900 fixpt_t pctcpu;
1901 struct kse *ke;
1902
1903 pctcpu = 0;
1904 ke = td->td_kse;
1905 if (ke == NULL)
1906 return (0);
1907
1908 mtx_lock_spin(&sched_lock);
1909 if (ke->ke_ticks) {
1910 int rtick;
1911
1912 /*
1913 * Don't update more frequently than twice a second. Allowing
1914 * this causes the cpu usage to decay away too quickly due to
1915 * rounding errors.
1916 */
1917 if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
1918 ke->ke_ltick < (ticks - (hz / 2)))
1919 sched_pctcpu_update(ke);
1920 /* How many rtick per second ? */
1921 rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
1922 pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1923 }
1924
1925 ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1926 mtx_unlock_spin(&sched_lock);
1927
1928 return (pctcpu);
1929 }
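/*
 * Worked example of the conversion above, assuming realstathz == 128:
 * 1280 run ticks over the ten second window give rtick == 128, so
 * pctcpu == (FSCALE * ((FSCALE * 128) / 128)) >> FSHIFT == FSCALE,
 * i.e. a reported 100% cpu.
 */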
1930
1931 void
1932 sched_bind(struct thread *td, int cpu)
1933 {
1934 struct kse *ke;
1935
1936 mtx_assert(&sched_lock, MA_OWNED);
1937 ke = td->td_kse;
1938 ke->ke_flags |= KEF_BOUND;
1939 #ifdef SMP
1940 if (PCPU_GET(cpuid) == cpu)
1941 return;
1942 /* sched_rem without the runq_remove */
1943 ke->ke_state = KES_THREAD;
1944 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
1945 kseq_notify(ke, cpu);
1946 /* When we return from mi_switch we'll be on the correct cpu. */
1947 mi_switch(SW_VOL, NULL);
1948 #endif
1949 }
1950
1951 void
1952 sched_unbind(struct thread *td)
1953 {
1954 mtx_assert(&sched_lock, MA_OWNED);
1955 td->td_kse->ke_flags &= ~KEF_BOUND;
1956 }
1957
1958 int
1959 sched_is_bound(struct thread *td)
1960 {
1961 mtx_assert(&sched_lock, MA_OWNED);
1962 return (td->td_kse->ke_flags & KEF_BOUND);
1963 }
1964
1965 int
1966 sched_load(void)
1967 {
1968 #ifdef SMP
1969 int total;
1970 int i;
1971
1972 total = 0;
1973 for (i = 0; i <= ksg_maxid; i++)
1974 total += KSEQ_GROUP(i)->ksg_load;
1975 return (total);
1976 #else
1977 return (KSEQ_SELF()->ksq_sysload);
1978 #endif
1979 }
1980
1981 int
1982 sched_sizeof_ksegrp(void)
1983 {
1984 return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1985 }
1986
1987 int
1988 sched_sizeof_proc(void)
1989 {
1990 return (sizeof(struct proc));
1991 }
1992
1993 int
1994 sched_sizeof_thread(void)
1995 {
1996 return (sizeof(struct thread) + sizeof(struct td_sched));
1997 }
1998 #define KERN_SWITCH_INCLUDE 1
1999 #include "kern/kern_switch.c"