FreeBSD/Linux Kernel Cross Reference
sys/kern/sched_ule.c
1 /*-
2 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice unmodified, this list of conditions, and the following
10 * disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD: releng/5.3/sys/kern/sched_ule.c 136957 2004-10-26 02:22:54Z scottl $");
29
30 #include <opt_sched.h>
31
32 #define kse td_sched
33
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kdb.h>
37 #include <sys/kernel.h>
38 #include <sys/ktr.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/proc.h>
42 #include <sys/resource.h>
43 #include <sys/resourcevar.h>
44 #include <sys/sched.h>
45 #include <sys/smp.h>
46 #include <sys/sx.h>
47 #include <sys/sysctl.h>
48 #include <sys/sysproto.h>
49 #include <sys/vmmeter.h>
50 #ifdef KTRACE
51 #include <sys/uio.h>
52 #include <sys/ktrace.h>
53 #endif
54
55 #include <machine/cpu.h>
56 #include <machine/smp.h>
57
58 #define KTR_ULE KTR_NFS
59
60 #error "The SCHED_ULE scheduler is broken. Please use SCHED_4BSD"
61
62 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
63 /* XXX This is bogus compatibility crap for ps */
64 static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
65 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
66
67 static void sched_setup(void *dummy);
68 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
69
70 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
71
72 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
73 "Scheduler name");
74
75 static int slice_min = 1;
76 SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
77
78 static int slice_max = 10;
79 SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
80
81 int realstathz;
82 int tickincr = 1;
83
84 #ifdef PREEMPTION
85 static void
86 printf_caddr_t(void *data)
87 {
88 printf("%s", (char *)data);
89 }
90 static char preempt_warning[] =
91 "WARNING: Kernel PREEMPTION is unstable under SCHED_ULE.\n";
92 SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
93 preempt_warning)
94 #endif
95
96 /*
97 * The schedulable entity that can be given a context to run.
98 * A process may have several of these. Probably one per processor
99  * but possibly a few more. In this universe they are grouped
100 * with a KSEG that contains the priority and niceness
101 * for the group.
102 */
103 struct kse {
104 TAILQ_ENTRY(kse) ke_kglist; /* (*) Queue of threads in ke_ksegrp. */
105 TAILQ_ENTRY(kse) ke_kgrlist; /* (*) Queue of threads in this state.*/
106 TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */
107 int ke_flags; /* (j) KEF_* flags. */
108 struct thread *ke_thread; /* (*) Active associated thread. */
109 fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */
110 u_char ke_oncpu; /* (j) Which cpu we are on. */
111 char ke_rqindex; /* (j) Run queue index. */
112 enum {
113 KES_THREAD = 0x0, /* slaved to thread state */
114 KES_ONRUNQ
115 } ke_state; /* (j) thread sched specific status. */
116 int ke_slptime;
117 int ke_slice;
118 struct runq *ke_runq;
119 u_char ke_cpu; /* CPU that we have affinity for. */
120 /* The following variables are only used for pctcpu calculation */
121 int ke_ltick; /* Last tick that we were running on */
122 int ke_ftick; /* First tick that we were running on */
123 int ke_ticks; /* Tick count */
124
125 };
126
127
128 #define td_kse td_sched
129 #define td_slptime td_kse->ke_slptime
130 #define ke_proc ke_thread->td_proc
131 #define ke_ksegrp ke_thread->td_ksegrp
132
133 /* flags kept in ke_flags */
134 #define KEF_SCHED0 0x00001 /* For scheduler-specific use. */
135 #define KEF_SCHED1 0x00002 /* For scheduler-specific use. */
136 #define KEF_SCHED2 0x00004 /* For scheduler-specific use. */
137 #define KEF_SCHED3 0x00008 /* For scheduler-specific use. */
138 #define KEF_DIDRUN 0x02000 /* Thread actually ran. */
139 #define KEF_EXIT 0x04000 /* Thread is being killed. */
140
141 /*
142 * These datastructures are allocated within their parent datastructure but
143 * are scheduler specific.
144 */
145
146 #define ke_assign ke_procq.tqe_next
147
148 #define KEF_ASSIGNED KEF_SCHED0 /* Thread is being migrated. */
149 #define KEF_BOUND KEF_SCHED1 /* Thread can not migrate. */
150 #define KEF_XFERABLE KEF_SCHED2 /* Thread was added as transferable. */
151 #define KEF_HOLD KEF_SCHED3 /* Thread is temporarily bound. */
152
153 struct kg_sched {
154 struct thread *skg_last_assigned; /* (j) Last thread assigned to */
155 /* the system scheduler */
156 int skg_slptime; /* Number of ticks we vol. slept */
157 int skg_runtime; /* Number of ticks we were running */
158 int skg_avail_opennings; /* (j) Num unfilled slots in group.*/
159 int skg_concurrency; /* (j) Num threads requested in group.*/
160 int skg_runq_threads; /* (j) Num KSEs on runq. */
161 };
162 #define kg_last_assigned kg_sched->skg_last_assigned
163 #define kg_avail_opennings kg_sched->skg_avail_opennings
164 #define kg_concurrency kg_sched->skg_concurrency
165 #define kg_runq_threads kg_sched->skg_runq_threads
166 #define kg_runtime kg_sched->skg_runtime
167 #define kg_slptime kg_sched->skg_slptime
168
169 #define SLOT_RELEASE(kg) \
170 do { \
171 kg->kg_avail_opennings++; \
172 CTR3(KTR_RUNQ, "kg %p(%d) Slot released (->%d)", \
173 kg, \
174 kg->kg_concurrency, \
175 kg->kg_avail_opennings); \
176 /*KASSERT((kg->kg_avail_opennings <= kg->kg_concurrency), \
177 ("slots out of whack")); */ \
178 } while (0)
179
180 #define SLOT_USE(kg) \
181 do { \
182 kg->kg_avail_opennings--; \
183 CTR3(KTR_RUNQ, "kg %p(%d) Slot used (->%d)", \
184 kg, \
185 kg->kg_concurrency, \
186 kg->kg_avail_opennings); \
187 /*KASSERT((kg->kg_avail_opennings >= 0), \
188 ("slots out of whack"));*/ \
189 } while (0)
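/*
 * SLOT_USE() and SLOT_RELEASE() track kg_avail_opennings, the count of
 * unfilled run slots a ksegrp may occupy (nominally bounded by
 * kg_concurrency).  In this file a slot is consumed in sched_add_internal()
 * and when sched_switch() brings in a supplied thread, and returned in
 * sched_rem() and when sched_switch() ends a thread's run.
 */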
190
191 static struct kse kse0;
192 static struct kg_sched kg_sched0;
193
194 /*
195 * The priority is primarily determined by the interactivity score. Thus, we
196 * give lower(better) priorities to kse groups that use less CPU. The nice
197 * value is then directly added to this to allow nice to have some effect
198 * on latency.
199 *
200 * PRI_RANGE: Total priority range for timeshare threads.
201 * PRI_NRESV: Number of nice values.
202 * PRI_BASE: The start of the dynamic range.
203 */
204 #define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
205 #define SCHED_PRI_NRESV ((PRIO_MAX - PRIO_MIN) + 1)
206 #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2)
207 #define SCHED_PRI_BASE (PRI_MIN_TIMESHARE)
208 #define SCHED_PRI_INTERACT(score) \
209 ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
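/*
 * For illustration, assuming the stock priority layout where
 * PRI_MIN_TIMESHARE is 160 and PRI_MAX_TIMESHARE is 223 (so
 * SCHED_PRI_RANGE is 64) and SCHED_INTERACT_MAX is 100:
 *
 *	score   0 -> SCHED_PRI_INTERACT(0)   =  0  (pri 160 + nice)
 *	score  30 -> SCHED_PRI_INTERACT(30)  = 19  (pri 179 + nice)
 *	score 100 -> SCHED_PRI_INTERACT(100) = 64  (clamped back to 223)
 *
 * sched_priority() below adds SCHED_PRI_BASE and p_nice to the scaled
 * score and clamps the result to the timeshare range.
 */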
210
211 /*
212 * These determine the interactivity of a process.
213 *
214 * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate
215 * before throttling back.
216 * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time.
217 * INTERACT_MAX: Maximum interactivity value. Smaller is better.
218  * INTERACT_THRESH:	Threshold for placement on the current runq.
219 */
220 #define SCHED_SLP_RUN_MAX ((hz * 5) << 10)
221 #define SCHED_SLP_RUN_FORK ((hz / 2) << 10)
222 #define SCHED_INTERACT_MAX (100)
223 #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2)
224 #define SCHED_INTERACT_THRESH (30)
225
226 /*
227 * These parameters and macros determine the size of the time slice that is
228 * granted to each thread.
229 *
230 * SLICE_MIN: Minimum time slice granted, in units of ticks.
231 * SLICE_MAX: Maximum time slice granted.
232 * SLICE_RANGE: Range of available time slices scaled by hz.
233  * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
234 * SLICE_NICE: Determine the amount of slice granted to a scaled nice.
235 * SLICE_NTHRESH: The nice cutoff point for slice assignment.
236 */
237 #define SCHED_SLICE_MIN (slice_min)
238 #define SCHED_SLICE_MAX (slice_max)
239 #define SCHED_SLICE_INTERACTIVE (slice_max)
240 #define SCHED_SLICE_NTHRESH (SCHED_PRI_NHALF - 1)
241 #define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
242 #define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max))
243 #define SCHED_SLICE_NICE(nice) \
244 (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
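/*
 * For illustration, assuming hz = 1000 so that sched_setup() picks
 * slice_min = 10 and slice_max = 142 ticks (giving SCHED_SLICE_RANGE = 133
 * and SCHED_SLICE_NTHRESH = 19):
 *
 *	nice == nicemin      -> SCHED_SLICE_NICE(0)  = 142 ticks
 *	nice == nicemin + 5  -> SCHED_SLICE_NICE(5)  = 142 - 35  = 107 ticks
 *	nice == nicemin + 19 -> SCHED_SLICE_NICE(19) = 142 - 133 = 9 ticks
 *
 * Anything further than SCHED_SLICE_NTHRESH from the least nice ksegrp
 * gets no slice at all from sched_slice(), except that nice 0 ksegrps
 * always receive at least SCHED_SLICE_MIN.
 */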
245
246 /*
247 * This macro determines whether or not the thread belongs on the current or
248 * next run queue.
249 */
250 #define SCHED_INTERACTIVE(kg) \
251 (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
252 #define SCHED_CURR(kg, ke) \
253 (ke->ke_thread->td_priority < kg->kg_user_pri || \
254 SCHED_INTERACTIVE(kg))
255
256 /*
257 * Cpu percentage computation macros and defines.
258 *
259 * SCHED_CPU_TIME: Number of seconds to average the cpu usage across.
260 * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across.
261 */
262
263 #define SCHED_CPU_TIME 10
264 #define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME)
265
266 /*
267 * kseq - per processor runqs and statistics.
268 */
269 struct kseq {
270 struct runq ksq_idle; /* Queue of IDLE threads. */
271 struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */
272 struct runq *ksq_next; /* Next timeshare queue. */
273 struct runq *ksq_curr; /* Current queue. */
274 int ksq_load_timeshare; /* Load for timeshare. */
275 int ksq_load; /* Aggregate load. */
276 short ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
277 short ksq_nicemin; /* Least nice. */
278 #ifdef SMP
279 int ksq_transferable;
280 LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */
281 struct kseq_group *ksq_group; /* Our processor group. */
282 volatile struct kse *ksq_assigned; /* assigned by another CPU. */
283 #else
284 int ksq_sysload; /* For loadavg, !ITHD load. */
285 #endif
286 };
287
288 #ifdef SMP
289 /*
290 * kseq groups are groups of processors which can cheaply share threads. When
291 * one processor in the group goes idle it will check the runqs of the other
292 * processors in its group prior to halting and waiting for an interrupt.
293  * These groups are suitable for SMT (Simultaneous Multi-Threading) and not NUMA.
294  * In a NUMA environment we'd want an idle bitmap per group and a two tiered
295 * load balancer.
296 */
297 struct kseq_group {
298 int ksg_cpus; /* Count of CPUs in this kseq group. */
299 cpumask_t ksg_cpumask; /* Mask of cpus in this group. */
300 cpumask_t ksg_idlemask; /* Idle cpus in this group. */
301 cpumask_t ksg_mask; /* Bit mask for first cpu. */
302 int ksg_load; /* Total load of this group. */
303 int ksg_transferable; /* Transferable load of this group. */
304 LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */
305 };
306 #endif
307
308 /*
309 * One kse queue per processor.
310 */
311 #ifdef SMP
312 static cpumask_t kseq_idle;
313 static int ksg_maxid;
314 static struct kseq kseq_cpu[MAXCPU];
315 static struct kseq_group kseq_groups[MAXCPU];
316 static int bal_tick;
317 static int gbal_tick;
318
319 #define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)])
320 #define KSEQ_CPU(x) (&kseq_cpu[(x)])
321 #define KSEQ_ID(x) ((x) - kseq_cpu)
322 #define KSEQ_GROUP(x) (&kseq_groups[(x)])
323 #else /* !SMP */
324 static struct kseq kseq_cpu;
325
326 #define KSEQ_SELF() (&kseq_cpu)
327 #define KSEQ_CPU(x) (&kseq_cpu)
328 #endif
329
330 static void slot_fill(struct ksegrp *kg);
331 static struct kse *sched_choose(void); /* XXX Should be thread * */
332 static void sched_add_internal(struct thread *td, int preemptive);
333 static void sched_slice(struct kse *ke);
334 static void sched_priority(struct ksegrp *kg);
335 static int sched_interact_score(struct ksegrp *kg);
336 static void sched_interact_update(struct ksegrp *kg);
337 static void sched_interact_fork(struct ksegrp *kg);
338 static void sched_pctcpu_update(struct kse *ke);
339
340 /* Operations on per processor queues */
341 static struct kse * kseq_choose(struct kseq *kseq);
342 static void kseq_setup(struct kseq *kseq);
343 static void kseq_load_add(struct kseq *kseq, struct kse *ke);
344 static void kseq_load_rem(struct kseq *kseq, struct kse *ke);
345 static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke);
346 static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke);
347 static void kseq_nice_add(struct kseq *kseq, int nice);
348 static void kseq_nice_rem(struct kseq *kseq, int nice);
349 void kseq_print(int cpu);
350 #ifdef SMP
351 static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class);
352 static struct kse *runq_steal(struct runq *rq);
353 static void sched_balance(void);
354 static void sched_balance_groups(void);
355 static void sched_balance_group(struct kseq_group *ksg);
356 static void sched_balance_pair(struct kseq *high, struct kseq *low);
357 static void kseq_move(struct kseq *from, int cpu);
358 static int kseq_idled(struct kseq *kseq);
359 static void kseq_notify(struct kse *ke, int cpu);
360 static void kseq_assign(struct kseq *);
361 static struct kse *kseq_steal(struct kseq *kseq, int stealidle);
362 /*
363 * On P4 Xeons the round-robin interrupt delivery is broken. As a result of
364  * this, we can't pin ithreads to the cpu that their interrupt was
365  * delivered to, or all ithreads would end up running on CPU 0.
366 */
367 #ifdef __i386__
368 #define KSE_CAN_MIGRATE(ke, class) \
369 ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
370 #else /* !__i386__ */
371 #define KSE_CAN_MIGRATE(ke, class) \
372 ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 && \
373 ((ke)->ke_flags & KEF_BOUND) == 0)
374 #endif /* !__i386__ */
375 #endif
376
377 void
378 kseq_print(int cpu)
379 {
380 struct kseq *kseq;
381 int i;
382
383 kseq = KSEQ_CPU(cpu);
384
385 printf("kseq:\n");
386 printf("\tload: %d\n", kseq->ksq_load);
387 printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
388 #ifdef SMP
389 printf("\tload transferable: %d\n", kseq->ksq_transferable);
390 #endif
391 printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
392 printf("\tnice counts:\n");
393 for (i = 0; i < SCHED_PRI_NRESV; i++)
394 if (kseq->ksq_nice[i])
395 printf("\t\t%d = %d\n",
396 i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
397 }
398
399 static __inline void
400 kseq_runq_add(struct kseq *kseq, struct kse *ke)
401 {
402 #ifdef SMP
403 if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
404 kseq->ksq_transferable++;
405 kseq->ksq_group->ksg_transferable++;
406 ke->ke_flags |= KEF_XFERABLE;
407 }
408 #endif
409 runq_add(ke->ke_runq, ke, 0);
410 }
411
412 static __inline void
413 kseq_runq_rem(struct kseq *kseq, struct kse *ke)
414 {
415 #ifdef SMP
416 if (ke->ke_flags & KEF_XFERABLE) {
417 kseq->ksq_transferable--;
418 kseq->ksq_group->ksg_transferable--;
419 ke->ke_flags &= ~KEF_XFERABLE;
420 }
421 #endif
422 runq_remove(ke->ke_runq, ke);
423 }
424
425 static void
426 kseq_load_add(struct kseq *kseq, struct kse *ke)
427 {
428 int class;
429 mtx_assert(&sched_lock, MA_OWNED);
430 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
431 if (class == PRI_TIMESHARE)
432 kseq->ksq_load_timeshare++;
433 kseq->ksq_load++;
434 if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
435 #ifdef SMP
436 kseq->ksq_group->ksg_load++;
437 #else
438 kseq->ksq_sysload++;
439 #endif
440 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
441 CTR6(KTR_ULE,
442 "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
443 ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
444 ke->ke_proc->p_nice, kseq->ksq_nicemin);
445 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
446 kseq_nice_add(kseq, ke->ke_proc->p_nice);
447 }
448
449 static void
450 kseq_load_rem(struct kseq *kseq, struct kse *ke)
451 {
452 int class;
453 mtx_assert(&sched_lock, MA_OWNED);
454 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
455 if (class == PRI_TIMESHARE)
456 kseq->ksq_load_timeshare--;
457 if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
458 #ifdef SMP
459 kseq->ksq_group->ksg_load--;
460 #else
461 kseq->ksq_sysload--;
462 #endif
463 kseq->ksq_load--;
464 ke->ke_runq = NULL;
465 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
466 kseq_nice_rem(kseq, ke->ke_proc->p_nice);
467 }
468
469 static void
470 kseq_nice_add(struct kseq *kseq, int nice)
471 {
472 mtx_assert(&sched_lock, MA_OWNED);
473 /* Normalize to zero. */
474 kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
475 if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
476 kseq->ksq_nicemin = nice;
477 }
478
479 static void
480 kseq_nice_rem(struct kseq *kseq, int nice)
481 {
482 int n;
483
484 mtx_assert(&sched_lock, MA_OWNED);
485 /* Normalize to zero. */
486 n = nice + SCHED_PRI_NHALF;
487 kseq->ksq_nice[n]--;
488 KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));
489
490 /*
491 * If this wasn't the smallest nice value or there are more in
492 * this bucket we can just return. Otherwise we have to recalculate
493 * the smallest nice.
494 */
495 if (nice != kseq->ksq_nicemin ||
496 kseq->ksq_nice[n] != 0 ||
497 kseq->ksq_load_timeshare == 0)
498 return;
499
500 for (; n < SCHED_PRI_NRESV; n++)
501 if (kseq->ksq_nice[n]) {
502 kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
503 return;
504 }
505 }
506
507 #ifdef SMP
508 /*
509 * sched_balance is a simple CPU load balancing algorithm. It operates by
510 * finding the least loaded and most loaded cpu and equalizing their load
511 * by migrating some processes.
512 *
513 * Dealing only with two CPUs at a time has two advantages. Firstly, most
514 * installations will only have 2 cpus. Secondly, load balancing too much at
515 * once can have an unpleasant effect on the system. The scheduler rarely has
516  * enough information to make perfect decisions.  So this algorithm favors
517  * simplicity and more gradual effects on load in larger systems.
518 *
519 * It could be improved by considering the priorities and slices assigned to
520 * each task prior to balancing them. There are many pathological cases with
521 * any approach and so the semi random algorithm below may work as well as any.
522 *
523 */
524 static void
525 sched_balance(void)
526 {
527 struct kseq_group *high;
528 struct kseq_group *low;
529 struct kseq_group *ksg;
530 int cnt;
531 int i;
532
533 if (smp_started == 0)
534 goto out;
535 low = high = NULL;
536 i = random() % (ksg_maxid + 1);
537 for (cnt = 0; cnt <= ksg_maxid; cnt++) {
538 ksg = KSEQ_GROUP(i);
539 /*
540 * Find the CPU with the highest load that has some
541 * threads to transfer.
542 */
543 if ((high == NULL || ksg->ksg_load > high->ksg_load)
544 && ksg->ksg_transferable)
545 high = ksg;
546 if (low == NULL || ksg->ksg_load < low->ksg_load)
547 low = ksg;
548 if (++i > ksg_maxid)
549 i = 0;
550 }
551 if (low != NULL && high != NULL && high != low)
552 sched_balance_pair(LIST_FIRST(&high->ksg_members),
553 LIST_FIRST(&low->ksg_members));
554 out:
555 bal_tick = ticks + (random() % (hz * 2));
556 }
557
558 static void
559 sched_balance_groups(void)
560 {
561 int i;
562
563 mtx_assert(&sched_lock, MA_OWNED);
564 if (smp_started)
565 for (i = 0; i <= ksg_maxid; i++)
566 sched_balance_group(KSEQ_GROUP(i));
567 gbal_tick = ticks + (random() % (hz * 2));
568 }
569
570 static void
571 sched_balance_group(struct kseq_group *ksg)
572 {
573 struct kseq *kseq;
574 struct kseq *high;
575 struct kseq *low;
576 int load;
577
578 if (ksg->ksg_transferable == 0)
579 return;
580 low = NULL;
581 high = NULL;
582 LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
583 load = kseq->ksq_load;
584 if (high == NULL || load > high->ksq_load)
585 high = kseq;
586 if (low == NULL || load < low->ksq_load)
587 low = kseq;
588 }
589 if (high != NULL && low != NULL && high != low)
590 sched_balance_pair(high, low);
591 }
592
593 static void
594 sched_balance_pair(struct kseq *high, struct kseq *low)
595 {
596 int transferable;
597 int high_load;
598 int low_load;
599 int move;
600 int diff;
601 int i;
602
603 /*
604  * If we're transferring within a group we have to use this specific
605 * kseq's transferable count, otherwise we can steal from other members
606 * of the group.
607 */
608 if (high->ksq_group == low->ksq_group) {
609 transferable = high->ksq_transferable;
610 high_load = high->ksq_load;
611 low_load = low->ksq_load;
612 } else {
613 transferable = high->ksq_group->ksg_transferable;
614 high_load = high->ksq_group->ksg_load;
615 low_load = low->ksq_group->ksg_load;
616 }
617 if (transferable == 0)
618 return;
619 /*
620 * Determine what the imbalance is and then adjust that to how many
621 * kses we actually have to give up (transferable).
622 */
623 diff = high_load - low_load;
624 move = diff / 2;
625 if (diff & 0x1)
626 move++;
627 move = min(move, transferable);
628 for (i = 0; i < move; i++)
629 kseq_move(high, KSEQ_ID(low));
630 return;
631 }
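/*
 * A small example of the arithmetic above, using hypothetical loads:
 * high_load = 5 and low_load = 2 give diff = 3, and move rounds up to 2
 * because diff is odd.  move is then clipped to the transferable count
 * and kseq_move() is invoked that many times to push kses toward the
 * less loaded cpu.
 */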
632
633 static void
634 kseq_move(struct kseq *from, int cpu)
635 {
636 struct kseq *kseq;
637 struct kseq *to;
638 struct kse *ke;
639
640 kseq = from;
641 to = KSEQ_CPU(cpu);
642 ke = kseq_steal(kseq, 1);
643 if (ke == NULL) {
644 struct kseq_group *ksg;
645
646 ksg = kseq->ksq_group;
647 LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
648 if (kseq == from || kseq->ksq_transferable == 0)
649 continue;
650 ke = kseq_steal(kseq, 1);
651 break;
652 }
653 if (ke == NULL)
654 panic("kseq_move: No KSEs available with a "
655 "transferable count of %d\n",
656 ksg->ksg_transferable);
657 }
658 if (kseq == to)
659 return;
660 ke->ke_state = KES_THREAD;
661 kseq_runq_rem(kseq, ke);
662 kseq_load_rem(kseq, ke);
663 kseq_notify(ke, cpu);
664 }
665
666 static int
667 kseq_idled(struct kseq *kseq)
668 {
669 struct kseq_group *ksg;
670 struct kseq *steal;
671 struct kse *ke;
672
673 ksg = kseq->ksq_group;
674 /*
675 * If we're in a cpu group, try and steal kses from another cpu in
676 * the group before idling.
677 */
678 if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
679 LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
680 if (steal == kseq || steal->ksq_transferable == 0)
681 continue;
682 ke = kseq_steal(steal, 0);
683 if (ke == NULL)
684 continue;
685 ke->ke_state = KES_THREAD;
686 kseq_runq_rem(steal, ke);
687 kseq_load_rem(steal, ke);
688 ke->ke_cpu = PCPU_GET(cpuid);
689 sched_add_internal(ke->ke_thread, 0);
690 return (0);
691 }
692 }
693 /*
694 * We only set the idled bit when all of the cpus in the group are
695 * idle. Otherwise we could get into a situation where a KSE bounces
696  * back and forth between two idle cores on separate physical CPUs.
697 */
698 ksg->ksg_idlemask |= PCPU_GET(cpumask);
699 if (ksg->ksg_idlemask != ksg->ksg_cpumask)
700 return (1);
701 atomic_set_int(&kseq_idle, ksg->ksg_mask);
702 return (1);
703 }
704
705 static void
706 kseq_assign(struct kseq *kseq)
707 {
708 struct kse *nke;
709 struct kse *ke;
710
711 do {
712 *(volatile struct kse **)&ke = kseq->ksq_assigned;
713 } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
714 for (; ke != NULL; ke = nke) {
715 nke = ke->ke_assign;
716 ke->ke_flags &= ~KEF_ASSIGNED;
717 sched_add_internal(ke->ke_thread, 0);
718 }
719 }
720
721 static void
722 kseq_notify(struct kse *ke, int cpu)
723 {
724 struct kseq *kseq;
725 struct thread *td;
726 struct pcpu *pcpu;
727 int prio;
728
729 ke->ke_cpu = cpu;
730 ke->ke_flags |= KEF_ASSIGNED;
731 prio = ke->ke_thread->td_priority;
732
733 kseq = KSEQ_CPU(cpu);
734
735 /*
736 * Place a KSE on another cpu's queue and force a resched.
737 */
738 do {
739 *(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned;
740 } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
741 /*
742 * Without sched_lock we could lose a race where we set NEEDRESCHED
743 * on a thread that is switched out before the IPI is delivered. This
744 * would lead us to miss the resched. This will be a problem once
745 * sched_lock is pushed down.
746 */
747 pcpu = pcpu_find(cpu);
748 td = pcpu->pc_curthread;
749 if (ke->ke_thread->td_priority < td->td_priority ||
750 td == pcpu->pc_idlethread) {
751 td->td_flags |= TDF_NEEDRESCHED;
752 ipi_selected(1 << cpu, IPI_AST);
753 }
754 }
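/*
 * ksq_assigned is a lock-free singly linked list threaded through
 * ke_assign: kseq_notify() pushes a kse onto it with the cmpset loop
 * above, and the target cpu later drains the whole list in kseq_assign(),
 * which re-adds each kse locally.  This lets one cpu hand work to another
 * without manipulating the remote run queues directly.
 */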
755
756 static struct kse *
757 runq_steal(struct runq *rq)
758 {
759 struct rqhead *rqh;
760 struct rqbits *rqb;
761 struct kse *ke;
762 int word;
763 int bit;
764
765 mtx_assert(&sched_lock, MA_OWNED);
766 rqb = &rq->rq_status;
767 for (word = 0; word < RQB_LEN; word++) {
768 if (rqb->rqb_bits[word] == 0)
769 continue;
770 for (bit = 0; bit < RQB_BPW; bit++) {
771 if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
772 continue;
773 rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
774 TAILQ_FOREACH(ke, rqh, ke_procq) {
775 if (KSE_CAN_MIGRATE(ke,
776 PRI_BASE(ke->ke_ksegrp->kg_pri_class)))
777 return (ke);
778 }
779 }
780 }
781 return (NULL);
782 }
783
784 static struct kse *
785 kseq_steal(struct kseq *kseq, int stealidle)
786 {
787 struct kse *ke;
788
789 /*
790 * Steal from next first to try to get a non-interactive task that
791 * may not have run for a while.
792 */
793 if ((ke = runq_steal(kseq->ksq_next)) != NULL)
794 return (ke);
795 if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
796 return (ke);
797 if (stealidle)
798 return (runq_steal(&kseq->ksq_idle));
799 return (NULL);
800 }
801
802 int
803 kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
804 {
805 struct kseq_group *ksg;
806 int cpu;
807
808 if (smp_started == 0)
809 return (0);
810 cpu = 0;
811 /*
812 * If our load exceeds a certain threshold we should attempt to
813 * reassign this thread. The first candidate is the cpu that
814 * originally ran the thread. If it is idle, assign it there,
815 * otherwise, pick an idle cpu.
816 *
817 * The threshold at which we start to reassign kses has a large impact
818 * on the overall performance of the system. Tuned too high and
819 * some CPUs may idle. Too low and there will be excess migration
820 * and context switches.
821 */
822 ksg = kseq->ksq_group;
823 if (ksg->ksg_load > ksg->ksg_cpus && kseq_idle) {
824 ksg = KSEQ_CPU(ke->ke_cpu)->ksq_group;
825 if (kseq_idle & ksg->ksg_mask) {
826 cpu = ffs(ksg->ksg_idlemask);
827 if (cpu)
828 goto migrate;
829 }
830 /*
831 * Multiple cpus could find this bit simultaneously
832 * but the race shouldn't be terrible.
833 */
834 cpu = ffs(kseq_idle);
835 if (cpu)
836 goto migrate;
837 }
838 /*
839 * If another cpu in this group has idled, assign a thread over
840 * to them after checking to see if there are idled groups.
841 */
842 ksg = kseq->ksq_group;
843 if (ksg->ksg_idlemask) {
844 cpu = ffs(ksg->ksg_idlemask);
845 if (cpu)
846 goto migrate;
847 }
848 /*
849 * No new CPU was found.
850 */
851 return (0);
852 migrate:
853 /*
854 * Now that we've found an idle CPU, migrate the thread.
855 */
856 cpu--;
857 ke->ke_runq = NULL;
858 kseq_notify(ke, cpu);
859
860 return (1);
861 }
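/*
 * To summarize the selection order above: when this group is loaded
 * beyond one kse per cpu, an idle cpu in the group that last ran the kse
 * is preferred, then any cpu whose entire group has idled; failing that,
 * an idle sibling in the current group is used.  If no candidate is
 * found the kse stays where it is and kseq_transfer() returns 0.
 */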
862
863 #endif /* SMP */
864
865 /*
866 * Pick the highest priority task we have and return it.
867 */
868
869 static struct kse *
870 kseq_choose(struct kseq *kseq)
871 {
872 struct kse *ke;
873 struct runq *swap;
874
875 mtx_assert(&sched_lock, MA_OWNED);
876 swap = NULL;
877
878 for (;;) {
879 ke = runq_choose(kseq->ksq_curr);
880 if (ke == NULL) {
881 /*
882 * We already swapped once and didn't get anywhere.
883 */
884 if (swap)
885 break;
886 swap = kseq->ksq_curr;
887 kseq->ksq_curr = kseq->ksq_next;
888 kseq->ksq_next = swap;
889 continue;
890 }
891 /*
892 * If we encounter a slice of 0 the kse is in a
893 * TIMESHARE kse group and its nice was too far out
894 * of the range that receives slices.
895 */
896 if (ke->ke_slice == 0) {
897 runq_remove(ke->ke_runq, ke);
898 sched_slice(ke);
899 ke->ke_runq = kseq->ksq_next;
900 runq_add(ke->ke_runq, ke, 0);
901 continue;
902 }
903 return (ke);
904 }
905
906 return (runq_choose(&kseq->ksq_idle));
907 }
908
909 static void
910 kseq_setup(struct kseq *kseq)
911 {
912 runq_init(&kseq->ksq_timeshare[0]);
913 runq_init(&kseq->ksq_timeshare[1]);
914 runq_init(&kseq->ksq_idle);
915 kseq->ksq_curr = &kseq->ksq_timeshare[0];
916 kseq->ksq_next = &kseq->ksq_timeshare[1];
917 kseq->ksq_load = 0;
918 kseq->ksq_load_timeshare = 0;
919 }
920
921 static void
922 sched_setup(void *dummy)
923 {
924 #ifdef SMP
925 int balance_groups;
926 int i;
927 #endif
928
929 slice_min = (hz/100); /* 10ms */
930 slice_max = (hz/7); /* ~140ms */
931
932 #ifdef SMP
933 balance_groups = 0;
934 /*
935 * Initialize the kseqs.
936 */
937 for (i = 0; i < MAXCPU; i++) {
938 struct kseq *ksq;
939
940 ksq = &kseq_cpu[i];
941 ksq->ksq_assigned = NULL;
942 kseq_setup(&kseq_cpu[i]);
943 }
944 if (smp_topology == NULL) {
945 struct kseq_group *ksg;
946 struct kseq *ksq;
947
948 for (i = 0; i < MAXCPU; i++) {
949 ksq = &kseq_cpu[i];
950 ksg = &kseq_groups[i];
951 /*
952 * Setup a kseq group with one member.
953 */
954 ksq->ksq_transferable = 0;
955 ksq->ksq_group = ksg;
956 ksg->ksg_cpus = 1;
957 ksg->ksg_idlemask = 0;
958 ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
959 ksg->ksg_load = 0;
960 ksg->ksg_transferable = 0;
961 LIST_INIT(&ksg->ksg_members);
962 LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
963 }
964 } else {
965 struct kseq_group *ksg;
966 struct cpu_group *cg;
967 int j;
968
969 for (i = 0; i < smp_topology->ct_count; i++) {
970 cg = &smp_topology->ct_group[i];
971 ksg = &kseq_groups[i];
972 /*
973 * Initialize the group.
974 */
975 ksg->ksg_idlemask = 0;
976 ksg->ksg_load = 0;
977 ksg->ksg_transferable = 0;
978 ksg->ksg_cpus = cg->cg_count;
979 ksg->ksg_cpumask = cg->cg_mask;
980 LIST_INIT(&ksg->ksg_members);
981 /*
982 * Find all of the group members and add them.
983 */
984 for (j = 0; j < MAXCPU; j++) {
985 if ((cg->cg_mask & (1 << j)) != 0) {
986 if (ksg->ksg_mask == 0)
987 ksg->ksg_mask = 1 << j;
988 kseq_cpu[j].ksq_transferable = 0;
989 kseq_cpu[j].ksq_group = ksg;
990 LIST_INSERT_HEAD(&ksg->ksg_members,
991 &kseq_cpu[j], ksq_siblings);
992 }
993 }
994 if (ksg->ksg_cpus > 1)
995 balance_groups = 1;
996 }
997 ksg_maxid = smp_topology->ct_count - 1;
998 }
999 /*
1000 * Stagger the group and global load balancer so they do not
1001 * interfere with each other.
1002 */
1003 bal_tick = ticks + hz;
1004 if (balance_groups)
1005 gbal_tick = ticks + (hz / 2);
1006 #else
1007 kseq_setup(KSEQ_SELF());
1008 #endif
1009 mtx_lock_spin(&sched_lock);
1010 kseq_load_add(KSEQ_SELF(), &kse0);
1011 mtx_unlock_spin(&sched_lock);
1012 }
1013
1014 /*
1015 * Scale the scheduling priority according to the "interactivity" of this
1016 * process.
1017 */
1018 static void
1019 sched_priority(struct ksegrp *kg)
1020 {
1021 int pri;
1022
1023 if (kg->kg_pri_class != PRI_TIMESHARE)
1024 return;
1025
1026 pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
1027 pri += SCHED_PRI_BASE;
1028 pri += kg->kg_proc->p_nice;
1029
1030 if (pri > PRI_MAX_TIMESHARE)
1031 pri = PRI_MAX_TIMESHARE;
1032 else if (pri < PRI_MIN_TIMESHARE)
1033 pri = PRI_MIN_TIMESHARE;
1034
1035 kg->kg_user_pri = pri;
1036
1037 return;
1038 }
1039
1040 /*
1041 * Calculate a time slice based on the properties of the kseg and the runq
1042 * that we're on. This is only for PRI_TIMESHARE ksegrps.
1043 */
1044 static void
1045 sched_slice(struct kse *ke)
1046 {
1047 struct kseq *kseq;
1048 struct ksegrp *kg;
1049
1050 kg = ke->ke_ksegrp;
1051 kseq = KSEQ_CPU(ke->ke_cpu);
1052
1053 /*
1054 * Rationale:
1055 * KSEs in interactive ksegs get a minimal slice so that we
1056 * quickly notice if it abuses its advantage.
1057 *
1058 * KSEs in non-interactive ksegs are assigned a slice that is
1059 * based on the ksegs nice value relative to the least nice kseg
1060 * on the run queue for this cpu.
1061 *
1062 * If the KSE is less nice than all others it gets the maximum
1063 * slice and other KSEs will adjust their slice relative to
1064 * this when they first expire.
1065 *
1066  *	There is a 20 point window that starts relative to the least
1067  *	nice kse on the run queue.  Slice size is determined by
1068  *	the kse's distance from the least nice ksegrp.
1069 *
1070 * If the kse is outside of the window it will get no slice
1071 * and will be reevaluated each time it is selected on the
1072 * run queue. The exception to this is nice 0 ksegs when
1073 * a nice -20 is running. They are always granted a minimum
1074 * slice.
1075 */
1076 if (!SCHED_INTERACTIVE(kg)) {
1077 int nice;
1078
1079 nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin);
1080 if (kseq->ksq_load_timeshare == 0 ||
1081 kg->kg_proc->p_nice < kseq->ksq_nicemin)
1082 ke->ke_slice = SCHED_SLICE_MAX;
1083 else if (nice <= SCHED_SLICE_NTHRESH)
1084 ke->ke_slice = SCHED_SLICE_NICE(nice);
1085 else if (kg->kg_proc->p_nice == 0)
1086 ke->ke_slice = SCHED_SLICE_MIN;
1087 else
1088 ke->ke_slice = 0;
1089 } else
1090 ke->ke_slice = SCHED_SLICE_INTERACTIVE;
1091
1092 CTR6(KTR_ULE,
1093 "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
1094 ke, ke->ke_slice, kg->kg_proc->p_nice, kseq->ksq_nicemin,
1095 kseq->ksq_load_timeshare, SCHED_INTERACTIVE(kg));
1096
1097 return;
1098 }
1099
1100 /*
1101 * This routine enforces a maximum limit on the amount of scheduling history
1102 * kept. It is called after either the slptime or runtime is adjusted.
1103 * This routine will not operate correctly when slp or run times have been
1104 * adjusted to more than double their maximum.
1105 */
1106 static void
1107 sched_interact_update(struct ksegrp *kg)
1108 {
1109 int sum;
1110
1111 sum = kg->kg_runtime + kg->kg_slptime;
1112 if (sum < SCHED_SLP_RUN_MAX)
1113 return;
1114 /*
1115 * If we have exceeded by more than 1/5th then the algorithm below
1116 * will not bring us back into range. Dividing by two here forces
1117  * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
1118 */
1119 if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1120 kg->kg_runtime /= 2;
1121 kg->kg_slptime /= 2;
1122 return;
1123 }
1124 kg->kg_runtime = (kg->kg_runtime / 5) * 4;
1125 kg->kg_slptime = (kg->kg_slptime / 5) * 4;
1126 }
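/*
 * For example, with hz = 1000 SCHED_SLP_RUN_MAX is 5000 << 10.  A ksegrp
 * whose kg_runtime + kg_slptime creeps just past that limit has both
 * components scaled back to 4/5 of their value; if a single large
 * adjustment (such as the runtime inherited in sched_exit_ksegrp())
 * pushes the sum past 6/5 of the limit, both are halved instead.
 */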
1127
1128 static void
1129 sched_interact_fork(struct ksegrp *kg)
1130 {
1131 int ratio;
1132 int sum;
1133
1134 sum = kg->kg_runtime + kg->kg_slptime;
1135 if (sum > SCHED_SLP_RUN_FORK) {
1136 ratio = sum / SCHED_SLP_RUN_FORK;
1137 kg->kg_runtime /= ratio;
1138 kg->kg_slptime /= ratio;
1139 }
1140 }
1141
1142 static int
1143 sched_interact_score(struct ksegrp *kg)
1144 {
1145 int div;
1146
1147 if (kg->kg_runtime > kg->kg_slptime) {
1148 div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
1149 return (SCHED_INTERACT_HALF +
1150 (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
1151 } if (kg->kg_slptime > kg->kg_runtime) {
1152 div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
1153 return (kg->kg_runtime / div);
1154 }
1155
1156 /*
1157  * This can happen if slptime and runtime are 0 or exactly equal.
1158 */
1159 return (0);
1160
1161 }
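/*
 * Worked examples of the score, using hypothetical <<10-scaled tick
 * totals and SCHED_INTERACT_HALF = 50:
 *
 *	kg_runtime = 100, kg_slptime = 400:
 *		div = 400 / 50 = 8, score = 100 / 8 = 12	(interactive)
 *	kg_runtime = 400, kg_slptime = 100:
 *		div = 400 / 50 = 8, score = 50 + (50 - 100 / 8) = 88
 *							(well past the threshold)
 */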
1162
1163 /*
1164 * Very early in the boot some setup of scheduler-specific
1165  * parts of proc0 and of some scheduler resources needs to be done.
1166 * Called from:
1167 * proc0_init()
1168 */
1169 void
1170 schedinit(void)
1171 {
1172 /*
1173 * Set up the scheduler specific parts of proc0.
1174 */
1175 proc0.p_sched = NULL; /* XXX */
1176 ksegrp0.kg_sched = &kg_sched0;
1177 thread0.td_sched = &kse0;
1178 kse0.ke_thread = &thread0;
1179 kse0.ke_oncpu = NOCPU; /* wrong.. can we use PCPU(cpuid) yet? */
1180 kse0.ke_state = KES_THREAD;
1181 kg_sched0.skg_concurrency = 1;
1182 kg_sched0.skg_avail_opennings = 0; /* we are already running */
1183 }
1184
1185 /*
1186 * This is only somewhat accurate since given many processes of the same
1187 * priority they will switch when their slices run out, which will be
1188 * at most SCHED_SLICE_MAX.
1189 */
1190 int
1191 sched_rr_interval(void)
1192 {
1193 return (SCHED_SLICE_MAX);
1194 }
1195
1196 static void
1197 sched_pctcpu_update(struct kse *ke)
1198 {
1199 /*
1200 * Adjust counters and watermark for pctcpu calc.
1201 */
1202 if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
1203 /*
1204 * Shift the tick count out so that the divide doesn't
1205 * round away our results.
1206 */
1207 ke->ke_ticks <<= 10;
1208 ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
1209 SCHED_CPU_TICKS;
1210 ke->ke_ticks >>= 10;
1211 } else
1212 ke->ke_ticks = 0;
1213 ke->ke_ltick = ticks;
1214 ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
1215 }
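/*
 * The net effect is that ke_ticks is rescaled to cover a window of
 * exactly SCHED_CPU_TICKS ticks ending now, preserving the fraction of
 * the old window the kse spent on cpu, and ke_ftick/ke_ltick are reset
 * so the window slides forward.  A kse that has not run for longer than
 * the window simply starts over at zero.
 */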
1216
1217 void
1218 sched_prio(struct thread *td, u_char prio)
1219 {
1220 struct kse *ke;
1221
1222 ke = td->td_kse;
1223 mtx_assert(&sched_lock, MA_OWNED);
1224 if (TD_ON_RUNQ(td)) {
1225 /*
1226 * If the priority has been elevated due to priority
1227 * propagation, we may have to move ourselves to a new
1228 * queue. We still call adjustrunqueue below in case kse
1229 * needs to fix things up.
1230 */
1231 if (prio < td->td_priority && ke &&
1232 (ke->ke_flags & KEF_ASSIGNED) == 0 &&
1233 ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
1234 runq_remove(ke->ke_runq, ke);
1235 ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
1236 runq_add(ke->ke_runq, ke, 0);
1237 }
1238 /*
1239 * Hold this kse on this cpu so that sched_prio() doesn't
1240 * cause excessive migration. We only want migration to
1241 * happen as the result of a wakeup.
1242 */
1243 ke->ke_flags |= KEF_HOLD;
1244 adjustrunqueue(td, prio);
1245 } else
1246 td->td_priority = prio;
1247 }
1248
1249 void
1250 sched_switch(struct thread *td, struct thread *newtd, int flags)
1251 {
1252 struct kse *ke;
1253
1254 mtx_assert(&sched_lock, MA_OWNED);
1255
1256 ke = td->td_kse;
1257
1258 td->td_lastcpu = td->td_oncpu;
1259 td->td_oncpu = NOCPU;
1260 td->td_flags &= ~TDF_NEEDRESCHED;
1261 td->td_pflags &= ~TDP_OWEPREEMPT;
1262
1263 /*
1264 * If the KSE has been assigned it may be in the process of switching
1265 * to the new cpu. This is the case in sched_bind().
1266 */
1267 if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
1268 if (td == PCPU_GET(idlethread)) {
1269 TD_SET_CAN_RUN(td);
1270 } else {
1271 /* We are ending our run so make our slot available again */
1272 SLOT_RELEASE(td->td_ksegrp);
1273 if (TD_IS_RUNNING(td)) {
1274 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
1275 /*
1276 * Don't allow the thread to migrate
1277 * from a preemption.
1278 */
1279 ke->ke_flags |= KEF_HOLD;
1280 setrunqueue(td, SRQ_OURSELF|SRQ_YIELDING);
1281 } else {
1282 if (ke->ke_runq) {
1283 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
1284 } else if ((td->td_flags & TDF_IDLETD) == 0)
1285 kdb_backtrace();
1286 /*
1287 * We will not be on the run queue.
1288 * So we must be sleeping or similar.
1289 * Don't use the slot if we will need it
1290 * for newtd.
1291 */
1292 if ((td->td_proc->p_flag & P_HADTHREADS) &&
1293 (newtd == NULL ||
1294 newtd->td_ksegrp != td->td_ksegrp))
1295 slot_fill(td->td_ksegrp);
1296 }
1297 }
1298 }
1299 if (newtd != NULL) {
1300 /*
1301 * If we bring in a thread,
1302 * then account for it as if it had been added to the
1303 * run queue and then chosen.
1304 */
1305 newtd->td_kse->ke_flags |= KEF_DIDRUN;
1306 SLOT_USE(newtd->td_ksegrp);
1307 TD_SET_RUNNING(newtd);
1308 kseq_load_add(KSEQ_SELF(), newtd->td_kse);
1309 } else
1310 newtd = choosethread();
1311 if (td != newtd)
1312 cpu_switch(td, newtd);
1313 sched_lock.mtx_lock = (uintptr_t)td;
1314
1315 td->td_oncpu = PCPU_GET(cpuid);
1316 }
1317
1318 void
1319 sched_nice(struct proc *p, int nice)
1320 {
1321 struct ksegrp *kg;
1322 struct kse *ke;
1323 struct thread *td;
1324 struct kseq *kseq;
1325
1326 PROC_LOCK_ASSERT(p, MA_OWNED);
1327 mtx_assert(&sched_lock, MA_OWNED);
1328 /*
1329 * We need to adjust the nice counts for running KSEs.
1330 */
1331 FOREACH_KSEGRP_IN_PROC(p, kg) {
1332 if (kg->kg_pri_class == PRI_TIMESHARE) {
1333 FOREACH_THREAD_IN_GROUP(kg, td) {
1334 ke = td->td_kse;
1335 if (ke->ke_runq == NULL)
1336 continue;
1337 kseq = KSEQ_CPU(ke->ke_cpu);
1338 kseq_nice_rem(kseq, p->p_nice);
1339 kseq_nice_add(kseq, nice);
1340 }
1341 }
1342 }
1343 p->p_nice = nice;
1344 FOREACH_KSEGRP_IN_PROC(p, kg) {
1345 sched_priority(kg);
1346 FOREACH_THREAD_IN_GROUP(kg, td)
1347 td->td_flags |= TDF_NEEDRESCHED;
1348 }
1349 }
1350
1351 void
1352 sched_sleep(struct thread *td)
1353 {
1354 mtx_assert(&sched_lock, MA_OWNED);
1355
1356 td->td_slptime = ticks;
1357 td->td_base_pri = td->td_priority;
1358
1359 CTR2(KTR_ULE, "sleep thread %p (tick: %d)",
1360 td, td->td_slptime);
1361 }
1362
1363 void
1364 sched_wakeup(struct thread *td)
1365 {
1366 mtx_assert(&sched_lock, MA_OWNED);
1367
1368 /*
1369 * Let the kseg know how long we slept for. This is because process
1370 * interactivity behavior is modeled in the kseg.
1371 */
1372 if (td->td_slptime) {
1373 struct ksegrp *kg;
1374 int hzticks;
1375
1376 kg = td->td_ksegrp;
1377 hzticks = (ticks - td->td_slptime) << 10;
1378 if (hzticks >= SCHED_SLP_RUN_MAX) {
1379 kg->kg_slptime = SCHED_SLP_RUN_MAX;
1380 kg->kg_runtime = 1;
1381 } else {
1382 kg->kg_slptime += hzticks;
1383 sched_interact_update(kg);
1384 }
1385 sched_priority(kg);
1386 sched_slice(td->td_kse);
1387 CTR2(KTR_ULE, "wakeup thread %p (%d ticks)", td, hzticks);
1388 td->td_slptime = 0;
1389 }
1390 setrunqueue(td, SRQ_BORING);
1391 }
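/*
 * The << 10 shift keeps hzticks in the same fixed point units that
 * sched_interact_update() works with.  For example, a thread that slept
 * for two seconds at an assumed hz of 1000 adds 2000 << 10 to
 * kg_slptime; a sleep long enough to exceed SCHED_SLP_RUN_MAX instead
 * resets the history to the most interactive state possible (maximum
 * sleep time, minimal run time).
 */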
1392
1393 /*
1394 * Penalize the parent for creating a new child and initialize the child's
1395 * priority.
1396 */
1397 void
1398 sched_fork(struct thread *td, struct thread *childtd)
1399 {
1400
1401 mtx_assert(&sched_lock, MA_OWNED);
1402
1403 sched_fork_ksegrp(td, childtd->td_ksegrp);
1404 sched_fork_thread(td, childtd);
1405 }
1406
1407 void
1408 sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
1409 {
1410 struct ksegrp *kg = td->td_ksegrp;
1411 mtx_assert(&sched_lock, MA_OWNED);
1412
1413 child->kg_slptime = kg->kg_slptime;
1414 child->kg_runtime = kg->kg_runtime;
1415 child->kg_user_pri = kg->kg_user_pri;
1416 sched_interact_fork(child);
1417 kg->kg_runtime += tickincr << 10;
1418 sched_interact_update(kg);
1419
1420 CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)",
1421 kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime,
1422 child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime);
1423 }
1424
1425 void
1426 sched_fork_thread(struct thread *td, struct thread *child)
1427 {
1428 struct kse *ke;
1429 struct kse *ke2;
1430
1431 sched_newthread(child);
1432 ke = td->td_kse;
1433 ke2 = child->td_kse;
1434 ke2->ke_slice = 1; /* Attempt to quickly learn interactivity. */
1435 ke2->ke_cpu = ke->ke_cpu;
1436 ke2->ke_runq = NULL;
1437
1438 	/* Grab our parent's cpu estimation information. */
1439 ke2->ke_ticks = ke->ke_ticks;
1440 ke2->ke_ltick = ke->ke_ltick;
1441 ke2->ke_ftick = ke->ke_ftick;
1442 }
1443
1444 void
1445 sched_class(struct ksegrp *kg, int class)
1446 {
1447 struct kseq *kseq;
1448 struct kse *ke;
1449 struct thread *td;
1450 int nclass;
1451 int oclass;
1452
1453 mtx_assert(&sched_lock, MA_OWNED);
1454 if (kg->kg_pri_class == class)
1455 return;
1456
1457 nclass = PRI_BASE(class);
1458 oclass = PRI_BASE(kg->kg_pri_class);
1459 FOREACH_THREAD_IN_GROUP(kg, td) {
1460 ke = td->td_kse;
1461 if (ke->ke_state != KES_ONRUNQ &&
1462 ke->ke_state != KES_THREAD)
1463 continue;
1464 kseq = KSEQ_CPU(ke->ke_cpu);
1465
1466 #ifdef SMP
1467 /*
1468 * On SMP if we're on the RUNQ we must adjust the transferable
1469 		 * count because we could be changing to or from an interrupt
1470 * class.
1471 */
1472 if (ke->ke_state == KES_ONRUNQ) {
1473 if (KSE_CAN_MIGRATE(ke, oclass)) {
1474 kseq->ksq_transferable--;
1475 kseq->ksq_group->ksg_transferable--;
1476 }
1477 if (KSE_CAN_MIGRATE(ke, nclass)) {
1478 kseq->ksq_transferable++;
1479 kseq->ksq_group->ksg_transferable++;
1480 }
1481 }
1482 #endif
1483 if (oclass == PRI_TIMESHARE) {
1484 kseq->ksq_load_timeshare--;
1485 kseq_nice_rem(kseq, kg->kg_proc->p_nice);
1486 }
1487 if (nclass == PRI_TIMESHARE) {
1488 kseq->ksq_load_timeshare++;
1489 kseq_nice_add(kseq, kg->kg_proc->p_nice);
1490 }
1491 }
1492
1493 kg->kg_pri_class = class;
1494 }
1495
1496 /*
1497 * Return some of the child's priority and interactivity to the parent.
1498 * Avoid using sched_exit_thread to avoid having to decide which
1499 * thread in the parent gets the honour since it isn't used.
1500 */
1501 void
1502 sched_exit(struct proc *p, struct thread *childtd)
1503 {
1504 mtx_assert(&sched_lock, MA_OWNED);
1505 sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd);
1506 kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
1507 }
1508
1509 void
1510 sched_exit_ksegrp(struct ksegrp *kg, struct thread *td)
1511 {
1512 /* kg->kg_slptime += td->td_ksegrp->kg_slptime; */
1513 kg->kg_runtime += td->td_ksegrp->kg_runtime;
1514 sched_interact_update(kg);
1515 }
1516
1517 void
1518 sched_exit_thread(struct thread *td, struct thread *childtd)
1519 {
1520 kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
1521 }
1522
1523 void
1524 sched_clock(struct thread *td)
1525 {
1526 struct kseq *kseq;
1527 struct ksegrp *kg;
1528 struct kse *ke;
1529
1530 mtx_assert(&sched_lock, MA_OWNED);
1531 kseq = KSEQ_SELF();
1532 #ifdef SMP
1533 if (ticks == bal_tick)
1534 sched_balance();
1535 if (ticks == gbal_tick)
1536 sched_balance_groups();
1537 /*
1538 * We could have been assigned a non real-time thread without an
1539 * IPI.
1540 */
1541 if (kseq->ksq_assigned)
1542 kseq_assign(kseq); /* Potentially sets NEEDRESCHED */
1543 #endif
1544 /*
1545 * sched_setup() apparently happens prior to stathz being set. We
1546 * need to resolve the timers earlier in the boot so we can avoid
1547 * calculating this here.
1548 */
1549 if (realstathz == 0) {
1550 realstathz = stathz ? stathz : hz;
1551 tickincr = hz / realstathz;
1552 /*
1553 * XXX This does not work for values of stathz that are much
1554 * larger than hz.
1555 */
1556 if (tickincr == 0)
1557 tickincr = 1;
1558 }
1559
1560 ke = td->td_kse;
1561 kg = ke->ke_ksegrp;
1562
1563 /* Adjust ticks for pctcpu */
1564 ke->ke_ticks++;
1565 ke->ke_ltick = ticks;
1566
1567 /* Go up to one second beyond our max and then trim back down */
1568 if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
1569 sched_pctcpu_update(ke);
1570
1571 if (td->td_flags & TDF_IDLETD)
1572 return;
1573
1574 CTR4(KTR_ULE, "Tick thread %p (slice: %d, slptime: %d, runtime: %d)",
1575 td, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
1576 /*
1577 * We only do slicing code for TIMESHARE ksegrps.
1578 */
1579 if (kg->kg_pri_class != PRI_TIMESHARE)
1580 return;
1581 /*
1582 	 * We used a tick; charge it to the ksegrp so that we can compute our
1583 * interactivity.
1584 */
1585 kg->kg_runtime += tickincr << 10;
1586 sched_interact_update(kg);
1587
1588 /*
1589 * We used up one time slice.
1590 */
1591 if (--ke->ke_slice > 0)
1592 return;
1593 /*
1594 * We're out of time, recompute priorities and requeue.
1595 */
1596 kseq_load_rem(kseq, ke);
1597 sched_priority(kg);
1598 sched_slice(ke);
1599 if (SCHED_CURR(kg, ke))
1600 ke->ke_runq = kseq->ksq_curr;
1601 else
1602 ke->ke_runq = kseq->ksq_next;
1603 kseq_load_add(kseq, ke);
1604 td->td_flags |= TDF_NEEDRESCHED;
1605 }
1606
1607 int
1608 sched_runnable(void)
1609 {
1610 struct kseq *kseq;
1611 int load;
1612
1613 load = 1;
1614
1615 kseq = KSEQ_SELF();
1616 #ifdef SMP
1617 if (kseq->ksq_assigned) {
1618 mtx_lock_spin(&sched_lock);
1619 kseq_assign(kseq);
1620 mtx_unlock_spin(&sched_lock);
1621 }
1622 #endif
1623 if ((curthread->td_flags & TDF_IDLETD) != 0) {
1624 if (kseq->ksq_load > 0)
1625 goto out;
1626 } else
1627 if (kseq->ksq_load - 1 > 0)
1628 goto out;
1629 load = 0;
1630 out:
1631 return (load);
1632 }
1633
1634 void
1635 sched_userret(struct thread *td)
1636 {
1637 struct ksegrp *kg;
1638
1639 kg = td->td_ksegrp;
1640
1641 if (td->td_priority != kg->kg_user_pri) {
1642 mtx_lock_spin(&sched_lock);
1643 td->td_priority = kg->kg_user_pri;
1644 mtx_unlock_spin(&sched_lock);
1645 }
1646 }
1647
1648 struct kse *
1649 sched_choose(void)
1650 {
1651 struct kseq *kseq;
1652 struct kse *ke;
1653
1654 mtx_assert(&sched_lock, MA_OWNED);
1655 kseq = KSEQ_SELF();
1656 #ifdef SMP
1657 restart:
1658 if (kseq->ksq_assigned)
1659 kseq_assign(kseq);
1660 #endif
1661 ke = kseq_choose(kseq);
1662 if (ke) {
1663 #ifdef SMP
1664 if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
1665 if (kseq_idled(kseq) == 0)
1666 goto restart;
1667 #endif
1668 kseq_runq_rem(kseq, ke);
1669 ke->ke_state = KES_THREAD;
1670
1671 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
1672 CTR4(KTR_ULE, "Run thread %p from %p (slice: %d, pri: %d)",
1673 ke->ke_thread, ke->ke_runq, ke->ke_slice,
1674 ke->ke_thread->td_priority);
1675 }
1676 return (ke);
1677 }
1678 #ifdef SMP
1679 if (kseq_idled(kseq) == 0)
1680 goto restart;
1681 #endif
1682 return (NULL);
1683 }
1684
1685 void
1686 sched_add(struct thread *td, int flags)
1687 {
1688
1689 /* let jeff work out how to map the flags better */
1690 /* I'm open to suggestions */
1691 if (flags & SRQ_YIELDING)
1692 /*
1693 * Preempting during switching can be bad JUJU
1694 * especially for KSE processes
1695 */
1696 sched_add_internal(td, 0);
1697 else
1698 sched_add_internal(td, 1);
1699 }
1700
1701 static void
1702 sched_add_internal(struct thread *td, int preemptive)
1703 {
1704 struct kseq *kseq;
1705 struct ksegrp *kg;
1706 struct kse *ke;
1707 #ifdef SMP
1708 int canmigrate;
1709 #endif
1710 int class;
1711
1712 mtx_assert(&sched_lock, MA_OWNED);
1713 ke = td->td_kse;
1714 kg = td->td_ksegrp;
1715 if (ke->ke_flags & KEF_ASSIGNED)
1716 return;
1717 kseq = KSEQ_SELF();
1718 KASSERT(ke->ke_state != KES_ONRUNQ,
1719 ("sched_add: kse %p (%s) already in run queue", ke,
1720 ke->ke_proc->p_comm));
1721 KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1722 ("sched_add: process swapped out"));
1723 KASSERT(ke->ke_runq == NULL,
1724 ("sched_add: KSE %p is still assigned to a run queue", ke));
1725
1726 class = PRI_BASE(kg->kg_pri_class);
1727 switch (class) {
1728 case PRI_ITHD:
1729 case PRI_REALTIME:
1730 ke->ke_runq = kseq->ksq_curr;
1731 ke->ke_slice = SCHED_SLICE_MAX;
1732 ke->ke_cpu = PCPU_GET(cpuid);
1733 break;
1734 case PRI_TIMESHARE:
1735 if (SCHED_CURR(kg, ke))
1736 ke->ke_runq = kseq->ksq_curr;
1737 else
1738 ke->ke_runq = kseq->ksq_next;
1739 break;
1740 case PRI_IDLE:
1741 /*
1742 		 * This is for priority propagation.
1743 */
1744 if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
1745 ke->ke_runq = kseq->ksq_curr;
1746 else
1747 ke->ke_runq = &kseq->ksq_idle;
1748 ke->ke_slice = SCHED_SLICE_MIN;
1749 break;
1750 default:
1751 panic("Unknown pri class.");
1752 break;
1753 }
1754 #ifdef SMP
1755 /*
1756 * Don't migrate running threads here. Force the long term balancer
1757 * to do it.
1758 */
1759 canmigrate = KSE_CAN_MIGRATE(ke, class);
1760 if (ke->ke_flags & KEF_HOLD) {
1761 ke->ke_flags &= ~KEF_HOLD;
1762 canmigrate = 0;
1763 }
1764 /*
1765 * If this thread is pinned or bound, notify the target cpu.
1766 */
1767 if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid) ) {
1768 ke->ke_runq = NULL;
1769 kseq_notify(ke, ke->ke_cpu);
1770 return;
1771 }
1772 /*
1773 * If we had been idle, clear our bit in the group and potentially
1774 * the global bitmap. If not, see if we should transfer this thread.
1775 */
1776 if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
1777 (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
1778 /*
1779 * Check to see if our group is unidling, and if so, remove it
1780 * from the global idle mask.
1781 */
1782 if (kseq->ksq_group->ksg_idlemask ==
1783 kseq->ksq_group->ksg_cpumask)
1784 atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
1785 /*
1786 * Now remove ourselves from the group specific idle mask.
1787 */
1788 kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
1789 } else if (kseq->ksq_load > 1 && canmigrate)
1790 if (kseq_transfer(kseq, ke, class))
1791 return;
1792 ke->ke_cpu = PCPU_GET(cpuid);
1793 #endif
1794 /*
1795 * XXX With preemption this is not necessary.
1796 */
1797 if (td->td_priority < curthread->td_priority &&
1798 ke->ke_runq == kseq->ksq_curr)
1799 curthread->td_flags |= TDF_NEEDRESCHED;
1800 if (preemptive && maybe_preempt(td))
1801 return;
1802 SLOT_USE(td->td_ksegrp);
1803 ke->ke_ksegrp->kg_runq_threads++;
1804 ke->ke_state = KES_ONRUNQ;
1805
1806 kseq_runq_add(kseq, ke);
1807 kseq_load_add(kseq, ke);
1808 }
1809
1810 void
1811 sched_rem(struct thread *td)
1812 {
1813 struct kseq *kseq;
1814 struct kse *ke;
1815
1816 ke = td->td_kse;
1817 /*
1818 * It is safe to just return here because sched_rem() is only ever
1819 * used in places where we're immediately going to add the
1820 * kse back on again. In that case it'll be added with the correct
1821 * thread and priority when the caller drops the sched_lock.
1822 */
1823 if (ke->ke_flags & KEF_ASSIGNED)
1824 return;
1825 mtx_assert(&sched_lock, MA_OWNED);
1826 KASSERT((ke->ke_state == KES_ONRUNQ),
1827 ("sched_rem: KSE not on run queue"));
1828
1829 ke->ke_state = KES_THREAD;
1830 SLOT_RELEASE(td->td_ksegrp);
1831 ke->ke_ksegrp->kg_runq_threads--;
1832 kseq = KSEQ_CPU(ke->ke_cpu);
1833 kseq_runq_rem(kseq, ke);
1834 kseq_load_rem(kseq, ke);
1835 }
1836
1837 fixpt_t
1838 sched_pctcpu(struct thread *td)
1839 {
1840 fixpt_t pctcpu;
1841 struct kse *ke;
1842
1843 pctcpu = 0;
1844 ke = td->td_kse;
1845 if (ke == NULL)
1846 return (0);
1847
1848 mtx_lock_spin(&sched_lock);
1849 if (ke->ke_ticks) {
1850 int rtick;
1851
1852 /*
1853 * Don't update more frequently than twice a second. Allowing
1854 * this causes the cpu usage to decay away too quickly due to
1855 * rounding errors.
1856 */
1857 if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
1858 ke->ke_ltick < (ticks - (hz / 2)))
1859 sched_pctcpu_update(ke);
1860 /* How many rtick per second ? */
1861 rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
1862 pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1863 }
1864
1865 ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1866 mtx_unlock_spin(&sched_lock);
1867
1868 return (pctcpu);
1869 }
1870
1871 void
1872 sched_bind(struct thread *td, int cpu)
1873 {
1874 struct kse *ke;
1875
1876 mtx_assert(&sched_lock, MA_OWNED);
1877 ke = td->td_kse;
1878 ke->ke_flags |= KEF_BOUND;
1879 #ifdef SMP
1880 if (PCPU_GET(cpuid) == cpu)
1881 return;
1882 /* sched_rem without the runq_remove */
1883 ke->ke_state = KES_THREAD;
1884 ke->ke_ksegrp->kg_runq_threads--;
1885 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
1886 kseq_notify(ke, cpu);
1887 /* When we return from mi_switch we'll be on the correct cpu. */
1888 mi_switch(SW_VOL, NULL);
1889 #endif
1890 }
1891
1892 void
1893 sched_unbind(struct thread *td)
1894 {
1895 mtx_assert(&sched_lock, MA_OWNED);
1896 td->td_kse->ke_flags &= ~KEF_BOUND;
1897 }
1898
1899 int
1900 sched_load(void)
1901 {
1902 #ifdef SMP
1903 int total;
1904 int i;
1905
1906 total = 0;
1907 for (i = 0; i <= ksg_maxid; i++)
1908 total += KSEQ_GROUP(i)->ksg_load;
1909 return (total);
1910 #else
1911 return (KSEQ_SELF()->ksq_sysload);
1912 #endif
1913 }
1914
1915 int
1916 sched_sizeof_ksegrp(void)
1917 {
1918 return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1919 }
1920
1921 int
1922 sched_sizeof_proc(void)
1923 {
1924 return (sizeof(struct proc));
1925 }
1926
1927 int
1928 sched_sizeof_thread(void)
1929 {
1930 return (sizeof(struct thread) + sizeof(struct td_sched));
1931 }
1932 #define KERN_SWITCH_INCLUDE 1
1933 #include "kern/kern_switch.c"