FreeBSD/Linux Kernel Cross Reference
sys/kern/usched_dfly.c


    1 /*
    2  * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
    3  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>.  All rights reserved.
    4  *
    5  * This code is derived from software contributed to The DragonFly Project
    6  * by Matthew Dillon <dillon@backplane.com>,
    7  * by Mihai Carabas <mihai.carabas@gmail.com>
    8  * and many others.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  *
   14  * 1. Redistributions of source code must retain the above copyright
   15  *    notice, this list of conditions and the following disclaimer.
   16  * 2. Redistributions in binary form must reproduce the above copyright
   17  *    notice, this list of conditions and the following disclaimer in
   18  *    the documentation and/or other materials provided with the
   19  *    distribution.
   20  * 3. Neither the name of The DragonFly Project nor the names of its
   21  *    contributors may be used to endorse or promote products derived
   22  *    from this software without specific, prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   25  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   27  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
   28  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   29  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
   30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   33  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   34  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   35  * SUCH DAMAGE.
   36  */
   37 #include <sys/param.h>
   38 #include <sys/systm.h>
   39 #include <sys/kernel.h>
   40 #include <sys/lock.h>
   41 #include <sys/queue.h>
   42 #include <sys/proc.h>
   43 #include <sys/rtprio.h>
   44 #include <sys/uio.h>
   45 #include <sys/sysctl.h>
   46 #include <sys/resourcevar.h>
   47 #include <sys/spinlock.h>
   48 #include <sys/cpu_topology.h>
   49 #include <sys/thread2.h>
   50 #include <sys/spinlock2.h>
   51 #include <sys/mplock2.h>
   52 
   53 #include <sys/ktr.h>
   54 
   55 #include <machine/cpu.h>
   56 #include <machine/smp.h>
   57 
   58 /*
   59  * Priorities.  Note that with 32 run queues per scheduler each queue
   60  * represents four priority levels.
   61  */
   62 
   63 int dfly_rebalanced;
   64 
   65 #define MAXPRI                  128
   66 #define PRIMASK                 (MAXPRI - 1)
   67 #define PRIBASE_REALTIME        0
   68 #define PRIBASE_NORMAL          MAXPRI
   69 #define PRIBASE_IDLE            (MAXPRI * 2)
   70 #define PRIBASE_THREAD          (MAXPRI * 3)
   71 #define PRIBASE_NULL            (MAXPRI * 4)
   72 
   73 #define NQS     32                      /* 32 run queues. */
   74 #define PPQ     (MAXPRI / NQS)          /* priorities per queue */
   75 #define PPQMASK (PPQ - 1)
   76 
   77 /*
   78  * NICEPPQ      - number of nice units per priority queue
   79  * ESTCPUPPQ    - number of estcpu units per priority queue
   80  * ESTCPUMAX    - number of estcpu units
   81  */
   82 #define NICEPPQ         2
   83 #define ESTCPUPPQ       512
   84 #define ESTCPUMAX       (ESTCPUPPQ * NQS)
   85 #define BATCHMAX        (ESTCPUFREQ * 30)
   86 #define PRIO_RANGE      (PRIO_MAX - PRIO_MIN + 1)
   87 
   88 #define ESTCPULIM(v)    min((v), ESTCPUMAX)
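/*
 * Illustrative note (editor's sketch, not part of the original file):
 * with the values above, PPQ = MAXPRI / NQS = 128 / 32 = 4 priorities
 * per queue, PPQMASK = 3, and ESTCPUMAX = ESTCPUPPQ * NQS = 512 * 32 =
 * 16384 estcpu units, the ceiling that ESTCPULIM() clamps to.  The
 * compile-time check below merely documents that arithmetic.
 */
#if PPQ != 4 || PPQMASK != 3 || ESTCPUMAX != 16384
#error "derived scheduler constants differ from the values quoted above"
#endif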
   89 
   90 TAILQ_HEAD(rq, lwp);
   91 
   92 #define lwp_priority    lwp_usdata.dfly.priority
   93 #define lwp_forked      lwp_usdata.dfly.forked
   94 #define lwp_rqindex     lwp_usdata.dfly.rqindex
   95 #define lwp_estcpu      lwp_usdata.dfly.estcpu
   96 #define lwp_estfast     lwp_usdata.dfly.estfast
   97 #define lwp_uload       lwp_usdata.dfly.uload
   98 #define lwp_rqtype      lwp_usdata.dfly.rqtype
   99 #define lwp_qcpu        lwp_usdata.dfly.qcpu
  100 #define lwp_rrcount     lwp_usdata.dfly.rrcount
  101 
  102 struct usched_dfly_pcpu {
  103         struct spinlock spin;
  104         struct thread   helper_thread;
  105         short           unusde01;
  106         short           upri;
  107         int             uload;
  108         int             ucount;
  109         struct lwp      *uschedcp;
  110         struct rq       queues[NQS];
  111         struct rq       rtqueues[NQS];
  112         struct rq       idqueues[NQS];
  113         u_int32_t       queuebits;
  114         u_int32_t       rtqueuebits;
  115         u_int32_t       idqueuebits;
  116         int             runqcount;
  117         int             cpuid;
  118         cpumask_t       cpumask;
  119         cpu_node_t      *cpunode;
  120 };
  121 
  122 typedef struct usched_dfly_pcpu *dfly_pcpu_t;
  123 
  124 static void dfly_acquire_curproc(struct lwp *lp);
  125 static void dfly_release_curproc(struct lwp *lp);
  126 static void dfly_select_curproc(globaldata_t gd);
  127 static void dfly_setrunqueue(struct lwp *lp);
  128 static void dfly_setrunqueue_dd(dfly_pcpu_t rdd, struct lwp *lp);
  129 static void dfly_schedulerclock(struct lwp *lp, sysclock_t period,
  130                                 sysclock_t cpstamp);
  131 static void dfly_recalculate_estcpu(struct lwp *lp);
  132 static void dfly_resetpriority(struct lwp *lp);
  133 static void dfly_forking(struct lwp *plp, struct lwp *lp);
  134 static void dfly_exiting(struct lwp *lp, struct proc *);
  135 static void dfly_uload_update(struct lwp *lp);
  136 static void dfly_yield(struct lwp *lp);
  137 static void dfly_changeqcpu_locked(struct lwp *lp,
  138                                 dfly_pcpu_t dd, dfly_pcpu_t rdd);
  139 static dfly_pcpu_t dfly_choose_best_queue(struct lwp *lp);
  140 static dfly_pcpu_t dfly_choose_worst_queue(dfly_pcpu_t dd);
  141 static dfly_pcpu_t dfly_choose_queue_simple(dfly_pcpu_t dd, struct lwp *lp);
  142 static void dfly_need_user_resched_remote(void *dummy);
  143 static struct lwp *dfly_chooseproc_locked(dfly_pcpu_t rdd, dfly_pcpu_t dd,
  144                                           struct lwp *chklp, int worst);
  145 static void dfly_remrunqueue_locked(dfly_pcpu_t dd, struct lwp *lp);
  146 static void dfly_setrunqueue_locked(dfly_pcpu_t dd, struct lwp *lp);
  147 static void dfly_changedcpu(struct lwp *lp);
  148 
  149 struct usched usched_dfly = {
  150         { NULL },
  151         "dfly", "Original DragonFly Scheduler",
  152         NULL,                   /* default registration */
  153         NULL,                   /* default deregistration */
  154         dfly_acquire_curproc,
  155         dfly_release_curproc,
  156         dfly_setrunqueue,
  157         dfly_schedulerclock,
  158         dfly_recalculate_estcpu,
  159         dfly_resetpriority,
  160         dfly_forking,
  161         dfly_exiting,
  162         dfly_uload_update,
  163         NULL,                   /* setcpumask not supported */
  164         dfly_yield,
  165         dfly_changedcpu
  166 };
  167 
  168 /*
  169  * We have NQS (32) run queues per scheduling class.  For the normal
  170  * class, there are 128 priorities scaled onto these 32 queues.  New
  171  * processes are added to the last entry in each queue, and processes
  172  * are selected for running by taking them from the head and maintaining
  173  * a simple FIFO arrangement.  Realtime and Idle priority processes have
   174  * an explicit 0-31 priority which maps directly onto their class queue
  175  * index.  When a queue has something in it, the corresponding bit is
  176  * set in the queuebits variable, allowing a single read to determine
  177  * the state of all 32 queues and then a ffs() to find the first busy
  178  * queue.
  179  */
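/*
 * Illustrative helper (editor's sketch, not part of the original file):
 * the normal-class run queue index is (lwp_priority & PRIMASK) / PPQ,
 * and a non-empty queue is located with a single bit scan of the bitmap
 * described above, roughly as follows.  bsfl() is the same bit-scan
 * primitive the scheduler itself uses further below.
 */
static __inline int
example_first_busy_queue(u_int32_t qbits)
{
        /* lowest set bit == best (lowest-index) non-empty queue */
        return (qbits ? (int)bsfl(qbits) : -1);
}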
  180 static cpumask_t dfly_curprocmask = -1; /* currently running a user process */
  181 static cpumask_t dfly_rdyprocmask;      /* ready to accept a user process */
  182 static volatile int dfly_scancpu;
  183 static volatile int dfly_ucount;        /* total running on whole system */
  184 static struct usched_dfly_pcpu dfly_pcpu[MAXCPU];
  185 static struct sysctl_ctx_list usched_dfly_sysctl_ctx;
  186 static struct sysctl_oid *usched_dfly_sysctl_tree;
  187 
  188 /* Debug info exposed through debug.* sysctl */
  189 
  190 static int usched_dfly_debug = -1;
  191 SYSCTL_INT(_debug, OID_AUTO, dfly_scdebug, CTLFLAG_RW,
  192            &usched_dfly_debug, 0,
  193            "Print debug information for this pid");
  194 
  195 static int usched_dfly_pid_debug = -1;
  196 SYSCTL_INT(_debug, OID_AUTO, dfly_pid_debug, CTLFLAG_RW,
  197            &usched_dfly_pid_debug, 0,
  198            "Print KTR debug information for this pid");
  199 
  200 static int usched_dfly_chooser = 0;
  201 SYSCTL_INT(_debug, OID_AUTO, dfly_chooser, CTLFLAG_RW,
  202            &usched_dfly_chooser, 0,
  203            "Print KTR debug information for this pid");
  204 
  205 /*
   206  * Tuning usched_dfly - configurable through kern.usched_dfly.
  207  *
  208  * weight1 - Tries to keep threads on their current cpu.  If you
  209  *           make this value too large the scheduler will not be
  210  *           able to load-balance large loads.
  211  *
  212  * weight2 - If non-zero, detects thread pairs undergoing synchronous
  213  *           communications and tries to move them closer together.
  214  *           Behavior is adjusted by bit 4 of features (0x10).
  215  *
   216  *           WARNING!  Weight2 is a ridiculously sensitive parameter;
  217  *           a small value is recommended.
  218  *
  219  * weight3 - Weighting based on the number of recently runnable threads
  220  *           on the userland scheduling queue (ignoring their loads).
  221  *           A nominal value here prevents high-priority (low-load)
  222  *           threads from accumulating on one cpu core when other
  223  *           cores are available.
  224  *
  225  *           This value should be left fairly small relative to weight1
  226  *           and weight4.
  227  *
  228  * weight4 - Weighting based on other cpu queues being available
   229  *           or running processes with higher lwp_priority values.
  230  *
  231  *           This allows a thread to migrate to another nearby cpu if it
  232  *           is unable to run on the current cpu based on the other cpu
  233  *           being idle or running a lower priority (higher lwp_priority)
   235  *           thread.  This value should be large enough to override weight1.
  235  *
  236  * features - These flags can be set or cleared to enable or disable various
  237  *            features.
  238  *
  239  *            0x01      Enable idle-cpu pulling                 (default)
  240  *            0x02      Enable proactive pushing                (default)
  241  *            0x04      Enable rebalancing rover                (default)
  242  *            0x08      Enable more proactive pushing           (default)
  243  *            0x10      (flip weight2 limit on same cpu)        (default)
  244  *            0x20      choose best cpu for forked process
  245  *            0x40      choose current cpu for forked process
  246  *            0x80      choose random cpu for forked process    (default)
  247  */
  248 static int usched_dfly_smt = 0;
  249 static int usched_dfly_cache_coherent = 0;
  250 static int usched_dfly_weight1 = 200;   /* keep thread on current cpu */
  251 static int usched_dfly_weight2 = 180;   /* synchronous peer's current cpu */
  252 static int usched_dfly_weight3 = 40;    /* number of threads on queue */
  253 static int usched_dfly_weight4 = 160;   /* availability of idle cores */
  254 static int usched_dfly_features = 0x8F; /* allow pulls */
  255 static int usched_dfly_fast_resched = 0;/* delta priority / resched */
  256 static int usched_dfly_swmask = ~PPQMASK; /* allow pulls */
  257 static int usched_dfly_rrinterval = (ESTCPUFREQ + 9) / 10;
  258 static int usched_dfly_decay = 8;
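/*
 * Illustrative helper (editor's sketch, not part of the original file):
 * the feature flags documented above are consumed with plain bitmask
 * tests, e.g. (usched_dfly_features & 0x04) gates the rebalancing rover
 * in dfly_schedulerclock(), 0x08 gates the forced-reschedule requeue
 * near the top of dfly_acquire_curproc(), and 0x02 gates the proactive
 * push later in the same loop.
 */
static __inline int
example_feature_enabled(int mask)
{
        return ((usched_dfly_features & mask) != 0);
}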
  259 
  260 /* KTR debug printings */
  261 
  262 KTR_INFO_MASTER(usched);
  263 
  264 #if !defined(KTR_USCHED_DFLY)
  265 #define KTR_USCHED_DFLY KTR_ALL
  266 #endif
  267 
  268 KTR_INFO(KTR_USCHED_DFLY, usched, chooseproc, 0,
  269     "USCHED_DFLY(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)",
  270     pid_t pid, int old_cpuid, int curr);
  271 
  272 /*
  273  * This function is called when the kernel intends to return to userland.
  274  * It is responsible for making the thread the current designated userland
  275  * thread for this cpu, blocking if necessary.
  276  *
  277  * The kernel will not depress our LWKT priority until after we return,
  278  * in case we have to shove over to another cpu.
  279  *
  280  * We must determine our thread's disposition before we switch away.  This
  281  * is very sensitive code.
  282  *
  283  * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
  284  * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
  285  * occur, this function is called only under very controlled circumstances.
  286  */
  287 static void
  288 dfly_acquire_curproc(struct lwp *lp)
  289 {
  290         globaldata_t gd;
  291         dfly_pcpu_t dd;
  292         dfly_pcpu_t rdd;
  293         thread_t td;
  294         int force_resched;
  295 
  296         /*
  297          * Make sure we aren't sitting on a tsleep queue.
  298          */
  299         td = lp->lwp_thread;
  300         crit_enter_quick(td);
  301         if (td->td_flags & TDF_TSLEEPQ)
  302                 tsleep_remove(td);
  303         dfly_recalculate_estcpu(lp);
  304 
  305         gd = mycpu;
  306         dd = &dfly_pcpu[gd->gd_cpuid];
  307 
  308         /*
  309          * Process any pending interrupts/ipi's, then handle reschedule
  310          * requests.  dfly_release_curproc() will try to assign a new
  311          * uschedcp that isn't us and otherwise NULL it out.
  312          */
  313         force_resched = 0;
  314         if ((td->td_mpflags & TDF_MP_BATCH_DEMARC) &&
  315             lp->lwp_rrcount >= usched_dfly_rrinterval / 2) {
  316                 force_resched = 1;
  317         }
  318 
  319         if (user_resched_wanted()) {
  320                 if (dd->uschedcp == lp)
  321                         force_resched = 1;
  322                 clear_user_resched();
  323                 dfly_release_curproc(lp);
  324         }
  325 
  326         /*
  327          * Loop until we are the current user thread.
  328          *
  329          * NOTE: dd spinlock not held at top of loop.
  330          */
  331         if (dd->uschedcp == lp)
  332                 lwkt_yield_quick();
  333 
  334         while (dd->uschedcp != lp) {
  335                 lwkt_yield_quick();
  336 
  337                 spin_lock(&dd->spin);
  338 
  339                 if (force_resched &&
  340                    (usched_dfly_features & 0x08) &&
  341                    (rdd = dfly_choose_best_queue(lp)) != dd) {
  342                         /*
  343                          * We are not or are no longer the current lwp and a
  344                          * forced reschedule was requested.  Figure out the
  345                          * best cpu to run on (our current cpu will be given
  346                          * significant weight).
  347                          *
  348                          * (if a reschedule was not requested we want to
  349                          *  move this step after the uschedcp tests).
  350                          */
  351                         dfly_changeqcpu_locked(lp, dd, rdd);
  352                         spin_unlock(&dd->spin);
  353                         lwkt_deschedule(lp->lwp_thread);
  354                         dfly_setrunqueue_dd(rdd, lp);
  355                         lwkt_switch();
  356                         gd = mycpu;
  357                         dd = &dfly_pcpu[gd->gd_cpuid];
  358                         continue;
  359                 }
  360 
  361                 /*
  362                  * Either no reschedule was requested or the best queue was
  363                  * dd, and no current process has been selected.  We can
  364                  * trivially become the current lwp on the current cpu.
  365                  */
  366                 if (dd->uschedcp == NULL) {
  367                         atomic_clear_int(&lp->lwp_thread->td_mpflags,
  368                                          TDF_MP_DIDYIELD);
  369                         atomic_set_cpumask(&dfly_curprocmask, gd->gd_cpumask);
  370                         dd->uschedcp = lp;
  371                         dd->upri = lp->lwp_priority;
  372                         KKASSERT(lp->lwp_qcpu == dd->cpuid);
  373                         spin_unlock(&dd->spin);
  374                         break;
  375                 }
  376 
  377                 /*
  378                  * Put us back on the same run queue unconditionally.
  379                  *
  380                  * Set rrinterval to force placement at end of queue.
  381                  * Select the worst queue to ensure we round-robin,
  382                  * but do not change estcpu.
  383                  */
  384                 if (lp->lwp_thread->td_mpflags & TDF_MP_DIDYIELD) {
  385                         u_int32_t tsqbits;
  386 
  387                         switch(lp->lwp_rqtype) {
  388                         case RTP_PRIO_NORMAL:
  389                                 tsqbits = dd->queuebits;
  390                                 spin_unlock(&dd->spin);
  391 
  392                                 lp->lwp_rrcount = usched_dfly_rrinterval;
  393                                 if (tsqbits)
  394                                         lp->lwp_rqindex = bsrl(tsqbits);
  395                                 break;
  396                         default:
  397                                 spin_unlock(&dd->spin);
  398                                 break;
  399                         }
  400                         lwkt_deschedule(lp->lwp_thread);
  401                         dfly_setrunqueue_dd(dd, lp);
  402                         atomic_clear_int(&lp->lwp_thread->td_mpflags,
  403                                          TDF_MP_DIDYIELD);
  404                         lwkt_switch();
  405                         gd = mycpu;
  406                         dd = &dfly_pcpu[gd->gd_cpuid];
  407                         continue;
  408                 }
  409 
  410                 /*
  411                  * Can we steal the current designated user thread?
  412                  *
  413                  * If we do the other thread will stall when it tries to
  414                  * return to userland, possibly rescheduling elsewhere.
  415                  *
  416                  * It is important to do a masked test to avoid the edge
  417                  * case where two near-equal-priority threads are constantly
  418                  * interrupting each other.
  419                  *
  420                  * In the exact match case another thread has already gained
   421                  * uschedcp and lowered its priority; if we steal it the
  422                  * other thread will stay stuck on the LWKT runq and not
  423                  * push to another cpu.  So don't steal on equal-priority even
  424                  * though it might appear to be more beneficial due to not
  425                  * having to switch back to the other thread's context.
  426                  *
  427                  * usched_dfly_fast_resched requires that two threads be
  428                  * significantly far apart in priority in order to interrupt.
  429                  *
  430                  * If better but not sufficiently far apart, the current
  431                  * uschedcp will be interrupted at the next scheduler clock.
  432                  */
  433                 if (dd->uschedcp &&
  434                    (dd->upri & ~PPQMASK) >
  435                    (lp->lwp_priority & ~PPQMASK) + usched_dfly_fast_resched) {
  436                         dd->uschedcp = lp;
  437                         dd->upri = lp->lwp_priority;
  438                         KKASSERT(lp->lwp_qcpu == dd->cpuid);
  439                         spin_unlock(&dd->spin);
  440                         break;
  441                 }
  442                 /*
  443                  * We are not the current lwp, figure out the best cpu
  444                  * to run on (our current cpu will be given significant
  445                  * weight).  Loop on cpu change.
  446                  */
  447                 if ((usched_dfly_features & 0x02) &&
  448                     force_resched == 0 &&
  449                     (rdd = dfly_choose_best_queue(lp)) != dd) {
  450                         dfly_changeqcpu_locked(lp, dd, rdd);
  451                         spin_unlock(&dd->spin);
  452                         lwkt_deschedule(lp->lwp_thread);
  453                         dfly_setrunqueue_dd(rdd, lp);
  454                         lwkt_switch();
  455                         gd = mycpu;
  456                         dd = &dfly_pcpu[gd->gd_cpuid];
  457                         continue;
  458                 }
  459 
  460                 /*
  461                  * We cannot become the current lwp, place the lp on the
  462                  * run-queue of this or another cpu and deschedule ourselves.
  463                  *
  464                  * When we are reactivated we will have another chance.
  465                  *
  466                  * Reload after a switch or setrunqueue/switch possibly
  467                  * moved us to another cpu.
  468                  */
  469                 spin_unlock(&dd->spin);
  470                 lwkt_deschedule(lp->lwp_thread);
  471                 dfly_setrunqueue_dd(dd, lp);
  472                 lwkt_switch();
  473                 gd = mycpu;
  474                 dd = &dfly_pcpu[gd->gd_cpuid];
  475         }
  476 
  477         /*
  478          * Make sure upri is synchronized, then yield to LWKT threads as
  479          * needed before returning.  This could result in another reschedule.
  480          * XXX
  481          */
  482         crit_exit_quick(td);
  483 
  484         KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
  485 }
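/*
 * Illustrative helper (editor's sketch, not part of the original file):
 * the "masked test" referred to above compares priorities by PPQ band
 * rather than exactly, so two threads whose lwp_priority values differ
 * by less than PPQ (4) fall in the same band and do not continually
 * preempt each other.  Lower numeric priority is more desirable, and
 * usched_dfly_fast_resched adds an extra required separation on top of
 * the band comparison shown here.
 */
static __inline int
example_band_is_better(int cur_upri, int new_pri)
{
        return ((cur_upri & ~PPQMASK) > (new_pri & ~PPQMASK));
}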
  486 
  487 /*
  488  * DFLY_RELEASE_CURPROC
  489  *
  490  * This routine detaches the current thread from the userland scheduler,
  491  * usually because the thread needs to run or block in the kernel (at
  492  * kernel priority) for a while.
  493  *
  494  * This routine is also responsible for selecting a new thread to
  495  * make the current thread.
  496  *
  497  * NOTE: This implementation differs from the dummy example in that
  498  * dfly_select_curproc() is able to select the current process, whereas
  499  * dummy_select_curproc() is not able to select the current process.
  500  * This means we have to NULL out uschedcp.
  501  *
  502  * Additionally, note that we may already be on a run queue if releasing
  503  * via the lwkt_switch() in dfly_setrunqueue().
  504  */
  505 static void
  506 dfly_release_curproc(struct lwp *lp)
  507 {
  508         globaldata_t gd = mycpu;
  509         dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
  510 
  511         /*
  512          * Make sure td_wakefromcpu is defaulted.  This will be overwritten
  513          * by wakeup().
  514          */
  515         if (dd->uschedcp == lp) {
  516                 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
  517                 spin_lock(&dd->spin);
  518                 if (dd->uschedcp == lp) {
  519                         dd->uschedcp = NULL;    /* don't let lp be selected */
  520                         dd->upri = PRIBASE_NULL;
  521                         atomic_clear_cpumask(&dfly_curprocmask, gd->gd_cpumask);
  522                         spin_unlock(&dd->spin);
  523                         dfly_select_curproc(gd);
  524                 } else {
  525                         spin_unlock(&dd->spin);
  526                 }
  527         }
  528 }
  529 
  530 /*
  531  * DFLY_SELECT_CURPROC
  532  *
  533  * Select a new current process for this cpu and clear any pending user
  534  * reschedule request.  The cpu currently has no current process.
  535  *
  536  * This routine is also responsible for equal-priority round-robining,
  537  * typically triggered from dfly_schedulerclock().  In our dummy example
  538  * all the 'user' threads are LWKT scheduled all at once and we just
  539  * call lwkt_switch().
  540  *
  541  * The calling process is not on the queue and cannot be selected.
  542  */
  543 static
  544 void
  545 dfly_select_curproc(globaldata_t gd)
  546 {
  547         dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
  548         struct lwp *nlp;
  549         int cpuid = gd->gd_cpuid;
  550 
  551         crit_enter_gd(gd);
  552 
  553         spin_lock(&dd->spin);
  554         nlp = dfly_chooseproc_locked(dd, dd, dd->uschedcp, 0);
  555 
  556         if (nlp) {
  557                 atomic_set_cpumask(&dfly_curprocmask, CPUMASK(cpuid));
  558                 dd->upri = nlp->lwp_priority;
  559                 dd->uschedcp = nlp;
  560 #if 0
  561                 dd->rrcount = 0;                /* reset round robin */
  562 #endif
  563                 spin_unlock(&dd->spin);
  564                 lwkt_acquire(nlp->lwp_thread);
  565                 lwkt_schedule(nlp->lwp_thread);
  566         } else {
  567                 spin_unlock(&dd->spin);
  568         }
  569         crit_exit_gd(gd);
  570 }
  571 
  572 /*
  573  * Place the specified lwp on the user scheduler's run queue.  This routine
  574  * must be called with the thread descheduled.  The lwp must be runnable.
  575  * It must not be possible for anyone else to explicitly schedule this thread.
  576  *
  577  * The thread may be the current thread as a special case.
  578  */
  579 static void
  580 dfly_setrunqueue(struct lwp *lp)
  581 {
  582         dfly_pcpu_t dd;
  583         dfly_pcpu_t rdd;
  584 
  585         /*
  586          * First validate the process LWKT state.
  587          */
  588         KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
  589         KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
  590             ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
  591              lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
  592         KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
  593 
  594         /*
  595          * NOTE: dd/rdd do not necessarily represent the current cpu.
  596          *       Instead they may represent the cpu the thread was last
   597          *       scheduled on or inherited from its parent.
  598          */
  599         dd = &dfly_pcpu[lp->lwp_qcpu];
  600         rdd = dd;
  601 
  602         /*
  603          * This process is not supposed to be scheduled anywhere or assigned
  604          * as the current process anywhere.  Assert the condition.
  605          */
  606         KKASSERT(rdd->uschedcp != lp);
  607 
  608         /*
  609          * Ok, we have to setrunqueue some target cpu and request a reschedule
  610          * if necessary.
  611          *
  612          * We have to choose the best target cpu.  It might not be the current
  613          * target even if the current cpu has no running user thread (for
  614          * example, because the current cpu might be a hyperthread and its
  615          * sibling has a thread assigned).
  616          *
  617          * If we just forked it is most optimal to run the child on the same
  618          * cpu just in case the parent decides to wait for it (thus getting
  619          * off that cpu).  As long as there is nothing else runnable on the
  620          * cpu, that is.  If we did this unconditionally a parent forking
  621          * multiple children before waiting (e.g. make -j N) leaves other
  622          * cpus idle that could be working.
  623          */
  624         if (lp->lwp_forked) {
  625                 lp->lwp_forked = 0;
  626                 if (usched_dfly_features & 0x20)
  627                         rdd = dfly_choose_best_queue(lp);
  628                 else if (usched_dfly_features & 0x40)
  629                         rdd = &dfly_pcpu[lp->lwp_qcpu];
  630                 else if (usched_dfly_features & 0x80)
  631                         rdd = dfly_choose_queue_simple(rdd, lp);
  632                 else if (dfly_pcpu[lp->lwp_qcpu].runqcount)
  633                         rdd = dfly_choose_best_queue(lp);
  634                 else
  635                         rdd = &dfly_pcpu[lp->lwp_qcpu];
  636         } else {
  637                 rdd = dfly_choose_best_queue(lp);
  638                 /* rdd = &dfly_pcpu[lp->lwp_qcpu]; */
  639         }
  640         if (lp->lwp_qcpu != rdd->cpuid) {
  641                 spin_lock(&dd->spin);
  642                 dfly_changeqcpu_locked(lp, dd, rdd);
  643                 spin_unlock(&dd->spin);
  644         }
  645         dfly_setrunqueue_dd(rdd, lp);
  646 }
  647 
  648 /*
  649  * Change qcpu to rdd->cpuid.  The dd the lp is CURRENTLY on must be
  650  * spin-locked on-call.  rdd does not have to be.
  651  */
  652 static void
  653 dfly_changeqcpu_locked(struct lwp *lp, dfly_pcpu_t dd, dfly_pcpu_t rdd)
  654 {
  655         if (lp->lwp_qcpu != rdd->cpuid) {
  656                 if (lp->lwp_mpflags & LWP_MP_ULOAD) {
  657                         atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
  658                         atomic_add_int(&dd->uload, -lp->lwp_uload);
  659                         atomic_add_int(&dd->ucount, -1);
  660                         atomic_add_int(&dfly_ucount, -1);
  661                 }
  662                 lp->lwp_qcpu = rdd->cpuid;
  663         }
  664 }
  665 
  666 /*
  667  * Place lp on rdd's runqueue.  Nothing is locked on call.  This function
  668  * also performs all necessary ancillary notification actions.
  669  */
  670 static void
  671 dfly_setrunqueue_dd(dfly_pcpu_t rdd, struct lwp *lp)
  672 {
  673         globaldata_t rgd;
  674 
  675         /*
  676          * We might be moving the lp to another cpu's run queue, and once
  677          * on the runqueue (even if it is our cpu's), another cpu can rip
  678          * it away from us.
  679          *
  680          * TDF_MIGRATING might already be set if this is part of a
  681          * remrunqueue+setrunqueue sequence.
  682          */
  683         if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
  684                 lwkt_giveaway(lp->lwp_thread);
  685 
  686         rgd = globaldata_find(rdd->cpuid);
  687 
  688         /*
  689          * We lose control of the lp the moment we release the spinlock
  690          * after having placed it on the queue.  i.e. another cpu could pick
  691          * it up, or it could exit, or its priority could be further
  692          * adjusted, or something like that.
  693          *
  694          * WARNING! rdd can point to a foreign cpu!
  695          */
  696         spin_lock(&rdd->spin);
  697         dfly_setrunqueue_locked(rdd, lp);
  698 
  699         /*
  700          * Potentially interrupt the currently-running thread
  701          */
  702         if ((rdd->upri & ~PPQMASK) <= (lp->lwp_priority & ~PPQMASK)) {
  703                 /*
  704                  * Currently running thread is better or same, do not
  705                  * interrupt.
  706                  */
  707                 spin_unlock(&rdd->spin);
  708         } else if ((rdd->upri & ~PPQMASK) <= (lp->lwp_priority & ~PPQMASK) +
  709                    usched_dfly_fast_resched) {
  710                 /*
  711                  * Currently running thread is not better, but not so bad
  712                  * that we need to interrupt it.  Let it run for one more
  713                  * scheduler tick.
  714                  */
  715                 if (rdd->uschedcp &&
  716                     rdd->uschedcp->lwp_rrcount < usched_dfly_rrinterval) {
  717                         rdd->uschedcp->lwp_rrcount = usched_dfly_rrinterval - 1;
  718                 }
  719                 spin_unlock(&rdd->spin);
  720         } else if (rgd == mycpu) {
  721                 /*
  722                  * We should interrupt the currently running thread, which
  723                  * is on the current cpu.  However, if DIDYIELD is set we
  724                  * round-robin unconditionally and do not interrupt it.
  725                  */
  726                 spin_unlock(&rdd->spin);
  727                 if (rdd->uschedcp == NULL)
  728                         wakeup_mycpu(&rdd->helper_thread); /* XXX */
  729                 if ((lp->lwp_thread->td_mpflags & TDF_MP_DIDYIELD) == 0)
  730                         need_user_resched();
  731         } else {
  732                 /*
  733                  * We should interrupt the currently running thread, which
  734                  * is on a different cpu.
  735                  */
  736                 spin_unlock(&rdd->spin);
  737                 lwkt_send_ipiq(rgd, dfly_need_user_resched_remote, NULL);
  738         }
  739 }
  740 
  741 /*
  742  * This routine is called from a systimer IPI.  It MUST be MP-safe and
  743  * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ on
  744  * each cpu.
  745  */
  746 static
  747 void
  748 dfly_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
  749 {
  750         globaldata_t gd = mycpu;
  751         dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
  752 
  753         /*
  754          * Spinlocks also hold a critical section so there should not be
  755          * any active.
  756          */
  757         KKASSERT(gd->gd_spinlocks == 0);
  758 
  759         if (lp == NULL)
  760                 return;
  761 
  762         /*
  763          * Do we need to round-robin?  We round-robin 10 times a second.
  764          * This should only occur for cpu-bound batch processes.
  765          */
  766         if (++lp->lwp_rrcount >= usched_dfly_rrinterval) {
  767                 lp->lwp_thread->td_wakefromcpu = -1;
  768                 need_user_resched();
  769         }
  770 
  771         /*
  772          * Adjust estcpu upward using a real time equivalent calculation,
  773          * and recalculate lp's priority.
  774          */
  775         lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);
  776         dfly_resetpriority(lp);
  777 
  778         /*
  779          * Rebalance two cpus every 8 ticks, pulling the worst thread
  780          * from the worst cpu's queue into a rotating cpu number.
  781          *
  782          * This mechanic is needed because the push algorithms can
   783          * steady-state in a non-optimal configuration.  We need to mix it
  784          * up a little, even if it means breaking up a paired thread, so
  785          * the push algorithms can rebalance the degenerate conditions.
  786          * This portion of the algorithm exists to ensure stability at the
  787          * selected weightings.
  788          *
  789          * Because we might be breaking up optimal conditions we do not want
  790          * to execute this too quickly, hence we only rebalance approximately
   791          * ~7-8 times per second.  The pushes, on the other hand, are capable of
  792          * moving threads to other cpus at a much higher rate.
  793          *
  794          * We choose the most heavily loaded thread from the worst queue
  795          * in order to ensure that multiple heavy-weight threads on the same
  796          * queue get broken up, and also because these threads are the most
  797          * likely to be able to remain in place.  Hopefully then any pairings,
  798          * if applicable, migrate to where these threads are.
  799          */
  800         if ((usched_dfly_features & 0x04) &&
  801             ((u_int)sched_ticks & 7) == 0 &&
  802             (u_int)sched_ticks / 8 % ncpus == gd->gd_cpuid) {
  803                 /*
  804                  * Our cpu is up.
  805                  */
  806                 struct lwp *nlp;
  807                 dfly_pcpu_t rdd;
  808 
  809                 rdd = dfly_choose_worst_queue(dd);
  810                 if (rdd) {
  811                         spin_lock(&dd->spin);
  812                         if (spin_trylock(&rdd->spin)) {
  813                                 nlp = dfly_chooseproc_locked(rdd, dd, NULL, 1);
  814                                 spin_unlock(&rdd->spin);
  815                                 if (nlp == NULL)
  816                                         spin_unlock(&dd->spin);
  817                         } else {
  818                                 spin_unlock(&dd->spin);
  819                                 nlp = NULL;
  820                         }
  821                 } else {
  822                         nlp = NULL;
  823                 }
  824                 /* dd->spin held if nlp != NULL */
  825 
  826                 /*
  827                  * Either schedule it or add it to our queue.
  828                  */
  829                 if (nlp &&
  830                     (nlp->lwp_priority & ~PPQMASK) < (dd->upri & ~PPQMASK)) {
  831                         atomic_set_cpumask(&dfly_curprocmask, dd->cpumask);
  832                         dd->upri = nlp->lwp_priority;
  833                         dd->uschedcp = nlp;
  834 #if 0
  835                         dd->rrcount = 0;        /* reset round robin */
  836 #endif
  837                         spin_unlock(&dd->spin);
  838                         lwkt_acquire(nlp->lwp_thread);
  839                         lwkt_schedule(nlp->lwp_thread);
  840                 } else if (nlp) {
  841                         dfly_setrunqueue_locked(dd, nlp);
  842                         spin_unlock(&dd->spin);
  843                 }
  844         }
  845 }
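/*
 * Illustrative helper (editor's sketch, not part of the original file):
 * ignoring the 0x04 feature test, the rover condition above fires on
 * every 8th scheduler tick and hands each such tick to a different cpu
 * in rotation, so only one cpu at a time pulls from the worst queue.
 */
static __inline int
example_rover_turn(u_int ticks, u_int ncpus_arg, u_int cpuid)
{
        return ((ticks & 7) == 0 && (ticks / 8) % ncpus_arg == cpuid);
}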
  846 
  847 /*
  848  * Called from acquire and from kern_synch's one-second timer (one of the
  849  * callout helper threads) with a critical section held.
  850  *
  851  * Adjust p_estcpu based on our single-cpu load, p_nice, and compensate for
  852  * overall system load.
  853  *
  854  * Note that no recalculation occurs for a process which sleeps and wakes
  855  * up in the same tick.  That is, a system doing thousands of context
  856  * switches per second will still only do serious estcpu calculations
  857  * ESTCPUFREQ times per second.
  858  */
  859 static
  860 void
  861 dfly_recalculate_estcpu(struct lwp *lp)
  862 {
  863         globaldata_t gd = mycpu;
  864         sysclock_t cpbase;
  865         sysclock_t ttlticks;
  866         int estcpu;
  867         int decay_factor;
  868         int ucount;
  869 
  870         /*
  871          * We have to subtract periodic to get the last schedclock
  872          * timeout time, otherwise we would get the upcoming timeout.
  873          * Keep in mind that a process can migrate between cpus and
  874          * while the scheduler clock should be very close, boundary
  875          * conditions could lead to a small negative delta.
  876          */
  877         cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
  878 
  879         if (lp->lwp_slptime > 1) {
  880                 /*
  881                  * Too much time has passed, do a coarse correction.
  882                  */
  883                 lp->lwp_estcpu = lp->lwp_estcpu >> 1;
  884                 dfly_resetpriority(lp);
  885                 lp->lwp_cpbase = cpbase;
  886                 lp->lwp_cpticks = 0;
  887                 lp->lwp_estfast = 0;
  888         } else if (lp->lwp_cpbase != cpbase) {
  889                 /*
  890                  * Adjust estcpu if we are in a different tick.  Don't waste
  891                  * time if we are in the same tick.
  892                  *
  893                  * First calculate the number of ticks in the measurement
  894                  * interval.  The ttlticks calculation can wind up 0 due to
  895                  * a bug in the handling of lwp_slptime  (as yet not found),
  896                  * so make sure we do not get a divide by 0 panic.
  897                  */
  898                 ttlticks = (cpbase - lp->lwp_cpbase) /
  899                            gd->gd_schedclock.periodic;
  900                 if ((ssysclock_t)ttlticks < 0) {
  901                         ttlticks = 0;
  902                         lp->lwp_cpbase = cpbase;
  903                 }
  904                 if (ttlticks == 0)
  905                         return;
  906                 updatepcpu(lp, lp->lwp_cpticks, ttlticks);
  907 
  908                 /*
  909                  * Calculate the percentage of one cpu being used then
  910                  * compensate for any system load in excess of ncpus.
  911                  *
  912                  * For example, if we have 8 cores and 16 running cpu-bound
  913                  * processes then all things being equal each process will
  914                  * get 50% of one cpu.  We need to pump this value back
  915                  * up to 100% so the estcpu calculation properly adjusts
  916                  * the process's dynamic priority.
  917                  *
  918                  * estcpu is scaled by ESTCPUMAX, pctcpu is scaled by FSCALE.
  919                  */
  920                 estcpu = (lp->lwp_pctcpu * ESTCPUMAX) >> FSHIFT;
  921                 ucount = dfly_ucount;
  922                 if (ucount > ncpus) {
  923                         estcpu += estcpu * (ucount - ncpus) / ncpus;
  924                 }
  925 
  926                 if (usched_dfly_debug == lp->lwp_proc->p_pid) {
  927                         kprintf("pid %d lwp %p estcpu %3d %3d cp %d/%d",
  928                                 lp->lwp_proc->p_pid, lp,
  929                                 estcpu, lp->lwp_estcpu,
  930                                 lp->lwp_cpticks, ttlticks);
  931                 }
  932 
  933                 /*
   934                  * Adjust lp->lwp_estcpu.  The decay factor determines how
  935                  * quickly lwp_estcpu collapses to its realtime calculation.
  936                  * A slower collapse gives us a more accurate number over
  937                  * the long term but can create problems with bursty threads
  938                  * or threads which become cpu hogs.
  939                  *
  940                  * To solve this problem, newly started lwps and lwps which
  941                  * are restarting after having been asleep for a while are
  942                  * given a much, much faster decay in order to quickly
  943                  * detect whether they become cpu-bound.
  944                  *
  945                  * NOTE: p_nice is accounted for in dfly_resetpriority(),
  946                  *       and not here, but we must still ensure that a
  947                  *       cpu-bound nice -20 process does not completely
  948                  *       override a cpu-bound nice +20 process.
  949                  *
  950                  * NOTE: We must use ESTCPULIM() here to deal with any
  951                  *       overshoot.
  952                  */
  953                 decay_factor = usched_dfly_decay;
  954                 if (decay_factor < 1)
  955                         decay_factor = 1;
  956                 if (decay_factor > 1024)
  957                         decay_factor = 1024;
  958 
  959                 if (lp->lwp_estfast < usched_dfly_decay) {
  960                         ++lp->lwp_estfast;
  961                         lp->lwp_estcpu = ESTCPULIM(
  962                                 (lp->lwp_estcpu * lp->lwp_estfast + estcpu) /
  963                                 (lp->lwp_estfast + 1));
  964                 } else {
  965                         lp->lwp_estcpu = ESTCPULIM(
  966                                 (lp->lwp_estcpu * decay_factor + estcpu) /
  967                                 (decay_factor + 1));
  968                 }
  969 
  970                 if (usched_dfly_debug == lp->lwp_proc->p_pid)
  971                         kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
  972                 dfly_resetpriority(lp);
  973                 lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
  974                 lp->lwp_cpticks = 0;
  975         }
  976 }
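/*
 * Illustrative helper (editor's sketch, not part of the original file):
 * the system-load compensation above, pulled out with a worked example.
 * With 8 cpus and 16 cpu-bound processes each process measures ~50% of
 * one cpu (raw estcpu ~ ESTCPUMAX / 2); the correction adds
 * estcpu * (16 - 8) / 8, pumping it back toward ESTCPUMAX so the
 * dynamic priority reflects a fully loaded system.
 */
static __inline int
example_load_compensate(int estcpu, int ucount_arg, int ncpus_arg)
{
        if (ucount_arg > ncpus_arg)
                estcpu += estcpu * (ucount_arg - ncpus_arg) / ncpus_arg;
        return (estcpu);
}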
  977 
  978 /*
  979  * Compute the priority of a process when running in user mode.
  980  * Arrange to reschedule if the resulting priority is better
  981  * than that of the current process.
  982  *
  983  * This routine may be called with any process.
  984  *
  985  * This routine is called by fork1() for initial setup with the process
  986  * of the run queue, and also may be called normally with the process on or
  987  * off the run queue.
  988  */
  989 static void
  990 dfly_resetpriority(struct lwp *lp)
  991 {
  992         dfly_pcpu_t rdd;
  993         int newpriority;
  994         u_short newrqtype;
  995         int rcpu;
  996         int checkpri;
  997         int estcpu;
  998         int delta_uload;
  999 
 1000         crit_enter();
 1001 
 1002         /*
 1003          * Lock the scheduler (lp) belongs to.  This can be on a different
 1004          * cpu.  Handle races.  This loop breaks out with the appropriate
 1005          * rdd locked.
 1006          */
 1007         for (;;) {
 1008                 rcpu = lp->lwp_qcpu;
 1009                 cpu_ccfence();
 1010                 rdd = &dfly_pcpu[rcpu];
 1011                 spin_lock(&rdd->spin);
 1012                 if (rcpu == lp->lwp_qcpu)
 1013                         break;
 1014                 spin_unlock(&rdd->spin);
 1015         }
 1016 
 1017         /*
 1018          * Calculate the new priority and queue type
 1019          */
 1020         newrqtype = lp->lwp_rtprio.type;
 1021 
 1022         switch(newrqtype) {
 1023         case RTP_PRIO_REALTIME:
 1024         case RTP_PRIO_FIFO:
 1025                 newpriority = PRIBASE_REALTIME +
 1026                              (lp->lwp_rtprio.prio & PRIMASK);
 1027                 break;
 1028         case RTP_PRIO_NORMAL:
 1029                 /*
 1030                  *
 1031                  */
 1032                 estcpu = lp->lwp_estcpu;
 1033 
 1034                 /*
 1035                  * p_nice piece         Adds (0-40) * 2         0-80
 1036                  * estcpu               Adds 16384  * 4 / 512   0-128
 1037                  */
 1038                 newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
 1039                 newpriority += estcpu * PPQ / ESTCPUPPQ;
 1040                 newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
 1041                               NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
 1042                 newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
 1043                 break;
 1044         case RTP_PRIO_IDLE:
 1045                 newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
 1046                 break;
 1047         case RTP_PRIO_THREAD:
 1048                 newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
 1049                 break;
 1050         default:
 1051                 panic("Bad RTP_PRIO %d", newrqtype);
 1052                 /* NOT REACHED */
 1053         }
 1054 
 1055         /*
  1056          * The LWKT scheduler doesn't dive into usched structures; give it a hint
 1057          * on the relative priority of user threads running in the kernel.
 1058          * The LWKT scheduler will always ensure that a user thread running
 1059          * in the kernel will get cpu some time, regardless of its upri,
 1060          * but can decide not to instantly switch from one kernel or user
 1061          * mode user thread to a kernel-mode user thread when it has a less
  1062          * desirable user priority.
 1063          *
  1064          * td_upri has normal sense (higher values are more desirable), so
 1065          * negate it.
 1066          */
 1067         lp->lwp_thread->td_upri = -(newpriority & usched_dfly_swmask);
 1068 
 1069         /*
 1070          * The newpriority incorporates the queue type so do a simple masked
 1071          * check to determine if the process has moved to another queue.  If
 1072          * it has, and it is currently on a run queue, then move it.
 1073          *
 1074          * Since uload is ~PPQMASK masked, no modifications are necessary if
 1075          * we end up in the same run queue.
 1076          */
 1077         if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
 1078                 if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
 1079                         dfly_remrunqueue_locked(rdd, lp);
 1080                         lp->lwp_priority = newpriority;
 1081                         lp->lwp_rqtype = newrqtype;
 1082                         lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
 1083                         dfly_setrunqueue_locked(rdd, lp);
 1084                         checkpri = 1;
 1085                 } else {
 1086                         lp->lwp_priority = newpriority;
 1087                         lp->lwp_rqtype = newrqtype;
 1088                         lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
 1089                         checkpri = 0;
 1090                 }
 1091         } else {
 1092                 /*
 1093                  * In the same PPQ, uload cannot change.
 1094                  */
 1095                 lp->lwp_priority = newpriority;
 1096                 checkpri = 1;
 1097                 rcpu = -1;
 1098         }
 1099 
 1100         /*
 1101          * Adjust effective load.
 1102          *
 1103          * Calculate load then scale up or down geometrically based on p_nice.
 1104          * Processes niced up (positive) are less important, and processes
  1105          * niced downward (negative) are more important.  The higher the uload,
 1106          * the more important the thread.
 1107          */
 1108         /* 0-511, 0-100% cpu */
 1109         delta_uload = lp->lwp_estcpu / NQS;
 1110         delta_uload -= delta_uload * lp->lwp_proc->p_nice / (PRIO_MAX + 1);
 1111 
 1112 
 1113         delta_uload -= lp->lwp_uload;
 1114         lp->lwp_uload += delta_uload;
 1115         if (lp->lwp_mpflags & LWP_MP_ULOAD)
 1116                 atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload, delta_uload);
 1117 
 1118         /*
 1119          * Determine if we need to reschedule the target cpu.  This only
 1120          * occurs if the LWP is already on a scheduler queue, which means
  1121          * that idle cpu notification has already occurred.  At most we
 1122          * need only issue a need_user_resched() on the appropriate cpu.
 1123          *
 1124          * The LWP may be owned by a CPU different from the current one,
 1125          * in which case dd->uschedcp may be modified without an MP lock
 1126          * or a spinlock held.  The worst that happens is that the code
 1127          * below causes a spurious need_user_resched() on the target CPU
  1128          * and dd->upri to be wrong for a short period of time, both of
 1129          * which are harmless.
 1130          *
 1131          * If checkpri is 0 we are adjusting the priority of the current
  1132          * process, possibly higher (less desirable), so ignore the upri
 1133          * check which will fail in that case.
 1134          */
 1135         if (rcpu >= 0) {
 1136                 if ((dfly_rdyprocmask & CPUMASK(rcpu)) &&
 1137                     (checkpri == 0 ||
 1138                      (rdd->upri & ~PRIMASK) >
 1139                      (lp->lwp_priority & ~PRIMASK))) {
 1140                         if (rcpu == mycpu->gd_cpuid) {
 1141                                 spin_unlock(&rdd->spin);
 1142                                 need_user_resched();
 1143                         } else {
 1144                                 spin_unlock(&rdd->spin);
 1145                                 lwkt_send_ipiq(globaldata_find(rcpu),
 1146                                                dfly_need_user_resched_remote,
 1147                                                NULL);
 1148                         }
 1149                 } else {
 1150                         spin_unlock(&rdd->spin);
 1151                 }
 1152         } else {
 1153                 spin_unlock(&rdd->spin);
 1154         }
 1155         crit_exit();
 1156 }
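/*
 * Illustrative helper (editor's sketch, not part of the original file):
 * the RTP_PRIO_NORMAL calculation above as a standalone function.  For
 * example, a nice +0, fully cpu-bound thread (estcpu == ESTCPUMAX)
 * works out to about 102 of the 0-127 range before PRIBASE_NORMAL is
 * added; a nice -20, idle thread works out to 0.
 */
static __inline int
example_normal_priority(int nice_val, int estcpu)
{
        int pri;

        pri = (nice_val - PRIO_MIN) * PPQ / NICEPPQ;    /* nice: 0..80 */
        pri += estcpu * PPQ / ESTCPUPPQ;                /* estcpu: 0..128 */
        pri = pri * MAXPRI /
              (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
        return (PRIBASE_NORMAL + (pri & PRIMASK));
}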
 1157 
 1158 static
 1159 void
 1160 dfly_yield(struct lwp *lp)
 1161 {
 1162         if (lp->lwp_qcpu != mycpu->gd_cpuid)
 1163                 return;
 1164         KKASSERT(lp == curthread->td_lwp);
 1165 
 1166         /*
 1167          * Don't set need_user_resched() or mess with rrcount or anything.
  1168          * The TDF flag will override everything as long as we release.
 1169          */
 1170         atomic_set_int(&lp->lwp_thread->td_mpflags, TDF_MP_DIDYIELD);
 1171         dfly_release_curproc(lp);
 1172 }
 1173 
 1174 /*
 1175  * Thread was forcefully migrated to another cpu.  Normally forced migrations
 1176  * are used for iterations and the kernel returns to the original cpu before
  1177  * returning, so this is not needed.  However, if the kernel migrates a
 1178  * thread to another cpu and wants to leave it there, it has to call this
 1179  * scheduler helper.
 1180  *
 1181  * Note that the lwkt_migratecpu() function also released the thread, so
 1182  * we don't have to worry about that.
 1183  */
 1184 static
 1185 void
 1186 dfly_changedcpu(struct lwp *lp)
 1187 {
 1188         dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];
 1189         dfly_pcpu_t rdd = &dfly_pcpu[mycpu->gd_cpuid];
 1190 
 1191         if (dd != rdd) {
 1192                 spin_lock(&dd->spin);
 1193                 dfly_changeqcpu_locked(lp, dd, rdd);
 1194                 spin_unlock(&dd->spin);
 1195         }
 1196 }
 1197 
 1198 /*
 1199  * Called from fork1() when a new child process is being created.
 1200  *
  1201  * Give the child process an initial estcpu that is more batchy than
 1202  * its parent and dock the parent for the fork (but do not
 1203  * reschedule the parent).
 1204  *
 1205  * fast
 1206  *
 1207  * XXX lwp should be "spawning" instead of "forking"
 1208  */
 1209 static void
 1210 dfly_forking(struct lwp *plp, struct lwp *lp)
 1211 {
 1212         /*
 1213          * Put the child 4 queue slots (out of 32) higher than the parent
  1214          * (less desirable than the parent).
 1215          */
 1216         lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);
 1217         lp->lwp_forked = 1;
 1218         lp->lwp_estfast = 0;
 1219 
 1220         /*
 1221          * Dock the parent a cost for the fork, protecting us from fork
 1222          * bombs.  If the parent is forking quickly make the child more
 1223          * batchy.
 1224          */
 1225         plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
 1226 }
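/*
 * Worked numbers (editor's note, not part of the original file): with
 * ESTCPUPPQ = 512 the child starts ESTCPUPPQ * 4 = 2048 estcpu units
 * (four of the 32 queue slots) more batchy than its parent, while each
 * fork docks the parent ESTCPUPPQ / 16 = 32 units, so a rapidly forking
 * parent steadily degrades its own priority as well.
 */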
 1227 
 1228 /*
 1229  * Called when a lwp is being removed from this scheduler, typically
 1230  * during lwp_exit().  We have to clean out any ULOAD accounting before
 1231  * we can let the lp go.  The dd->spin lock is not needed for uload
 1232  * updates.
 1233  *
 1234  * Scheduler dequeueing has already occurred, no further action in that
 1235  * regard is needed.
 1236  */
 1237 static void
 1238 dfly_exiting(struct lwp *lp, struct proc *child_proc)
 1239 {
 1240         dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];
 1241 
 1242         if (lp->lwp_mpflags & LWP_MP_ULOAD) {
 1243                 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
 1244                 atomic_add_int(&dd->uload, -lp->lwp_uload);
 1245                 atomic_add_int(&dd->ucount, -1);
 1246                 atomic_add_int(&dfly_ucount, -1);
 1247         }
 1248 }
 1249 
 1250 /*
 1251  * This function cannot block in any way, but spinlocks are ok.
 1252  *
 1253  * Update the uload based on the state of the thread (whether it is going
 1254  * to sleep or running again).  The uload is meant to be a longer-term
 1255  * load and not an instantaneous load.
 1256  */
 1257 static void
 1258 dfly_uload_update(struct lwp *lp)
 1259 {
 1260         dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];
 1261 
 1262         if (lp->lwp_thread->td_flags & TDF_RUNQ) {
 1263                 if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) {
 1264                         spin_lock(&dd->spin);
 1265                         if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) {
 1266                                 atomic_set_int(&lp->lwp_mpflags,
 1267                                                LWP_MP_ULOAD);
 1268                                 atomic_add_int(&dd->uload, lp->lwp_uload);
 1269                                 atomic_add_int(&dd->ucount, 1);
 1270                                 atomic_add_int(&dfly_ucount, 1);
 1271                         }
 1272                         spin_unlock(&dd->spin);
 1273                 }
 1274         } else if (lp->lwp_slptime > 0) {
 1275                 if (lp->lwp_mpflags & LWP_MP_ULOAD) {
 1276                         spin_lock(&dd->spin);
 1277                         if (lp->lwp_mpflags & LWP_MP_ULOAD) {
 1278                                 atomic_clear_int(&lp->lwp_mpflags,
 1279                                                  LWP_MP_ULOAD);
 1280                                 atomic_add_int(&dd->uload, -lp->lwp_uload);
 1281                                 atomic_add_int(&dd->ucount, -1);
 1282                                 atomic_add_int(&dfly_ucount, -1);
 1283                         }
 1284                         spin_unlock(&dd->spin);
 1285                 }
 1286         }
 1287 }
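/*
 * [Editor's illustrative sketch -- not part of usched_dfly.c]
 *
 * Both branches above use the same check / lock / re-check shape: the
 * flag is tested lock-free first and only re-tested (and acted upon)
 * under dd->spin, so the common "nothing to do" case never touches the
 * spinlock and a racing updater cannot apply the accounting twice.  A
 * user-space analogue of that pattern (all names invented for the demo):
 */
#if 0   /* standalone demonstration, never compiled into the kernel */
#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t demo_spin = PTHREAD_MUTEX_INITIALIZER;
static atomic_int demo_flag;            /* stands in for LWP_MP_ULOAD */
static atomic_int demo_uload;

static void
demo_account_once(int delta)
{
        if (atomic_load(&demo_flag))            /* cheap lock-free test    */
                return;
        pthread_mutex_lock(&demo_spin);
        if (!atomic_load(&demo_flag)) {         /* re-check under the lock */
                atomic_store(&demo_flag, 1);
                atomic_fetch_add(&demo_uload, delta);
        }
        pthread_mutex_unlock(&demo_spin);
}
#endif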
 1288 
 1289 /*
 1290  * chooseproc() is called when a cpu needs a user process to LWKT schedule;
 1291  * it selects a user process and returns it.  If chklp is non-NULL and chklp
 1292  * has a better or equal priority than the process that would otherwise be
 1293  * chosen, NULL is returned.
 1294  *
 1295  * Until we fix the RUNQ code the chklp test has to be strict or we may
 1296  * bounce between processes trying to acquire the current process designation.
 1297  *
 1298  * Must be called with rdd->spin locked.  The spinlock is left intact through
 1299  * the entire routine.  dd->spin does not have to be locked.
 1300  *
 1301  * If worst is non-zero this function finds the worst thread instead of the
 1302  * best thread (used by the schedulerclock-based rover).
 1303  */
 1304 static
 1305 struct lwp *
 1306 dfly_chooseproc_locked(dfly_pcpu_t rdd, dfly_pcpu_t dd,
 1307                        struct lwp *chklp, int worst)
 1308 {
 1309         struct lwp *lp;
 1310         struct rq *q;
 1311         u_int32_t *which;
 1312         u_int32_t pri;
 1313         u_int32_t rtqbits;
 1314         u_int32_t tsqbits;
 1315         u_int32_t idqbits;
 1316 
 1317         rtqbits = rdd->rtqueuebits;
 1318         tsqbits = rdd->queuebits;
 1319         idqbits = rdd->idqueuebits;
 1320 
 1321         if (worst) {
 1322                 if (idqbits) {
 1323                         pri = bsrl(idqbits);
 1324                         q = &rdd->idqueues[pri];
 1325                         which = &rdd->idqueuebits;
 1326                 } else if (tsqbits) {
 1327                         pri = bsrl(tsqbits);
 1328                         q = &rdd->queues[pri];
 1329                         which = &rdd->queuebits;
 1330                 } else if (rtqbits) {
 1331                         pri = bsrl(rtqbits);
 1332                         q = &rdd->rtqueues[pri];
 1333                         which = &rdd->rtqueuebits;
 1334                 } else {
 1335                         return (NULL);
 1336                 }
 1337                 lp = TAILQ_LAST(q, rq);
 1338         } else {
 1339                 if (rtqbits) {
 1340                         pri = bsfl(rtqbits);
 1341                         q = &rdd->rtqueues[pri];
 1342                         which = &rdd->rtqueuebits;
 1343                 } else if (tsqbits) {
 1344                         pri = bsfl(tsqbits);
 1345                         q = &rdd->queues[pri];
 1346                         which = &rdd->queuebits;
 1347                 } else if (idqbits) {
 1348                         pri = bsfl(idqbits);
 1349                         q = &rdd->idqueues[pri];
 1350                         which = &rdd->idqueuebits;
 1351                 } else {
 1352                         return (NULL);
 1353                 }
 1354                 lp = TAILQ_FIRST(q);
 1355         }
 1356         KASSERT(lp, ("chooseproc: no lwp on busy queue"));
 1357 
 1358         /*
 1359          * If the passed lwp <chklp> is reasonably close to the selected
 1360          * lwp <lp>, return NULL (indicating that <chklp> should be kept).
 1361          *
 1362          * Note that we must err on the side of <chklp> to avoid bouncing
 1363          * between threads in the acquire code.
 1364          */
 1365         if (chklp) {
 1366                 if (chklp->lwp_priority < lp->lwp_priority + PPQ)
 1367                         return(NULL);
 1368         }
 1369 
 1370         KTR_COND_LOG(usched_chooseproc,
 1371             lp->lwp_proc->p_pid == usched_dfly_pid_debug,
 1372             lp->lwp_proc->p_pid,
 1373             lp->lwp_thread->td_gd->gd_cpuid,
 1374             mycpu->gd_cpuid);
 1375 
 1376         KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
 1377         atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
 1378         TAILQ_REMOVE(q, lp, lwp_procq);
 1379         --rdd->runqcount;
 1380         if (TAILQ_EMPTY(q))
 1381                 *which &= ~(1 << pri);
 1382 
 1383         /*
 1384          * If we are choosing a process from rdd with the intent to
 1385          * move it to dd, lwp_qcpu must be adjusted while rdd's spinlock
 1386          * is still held.
 1387          */
 1388         if (rdd != dd) {
 1389                 if (lp->lwp_mpflags & LWP_MP_ULOAD) {
 1390                         atomic_add_int(&rdd->uload, -lp->lwp_uload);
 1391                         atomic_add_int(&rdd->ucount, -1);
 1392                         atomic_add_int(&dfly_ucount, -1);
 1393                 }
 1394                 lp->lwp_qcpu = dd->cpuid;
 1395                 atomic_add_int(&dd->uload, lp->lwp_uload);
 1396                 atomic_add_int(&dd->ucount, 1);
 1397                 atomic_add_int(&dfly_ucount, 1);
 1398                 atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
 1399         }
 1400         return lp;
 1401 }
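/*
 * [Editor's illustrative sketch -- not part of usched_dfly.c]
 *
 * The selection above keeps one status bit per run queue: bsfl() (find
 * lowest set bit => best priority) picks the best non-empty queue and
 * bsrl() (find highest set bit) picks the worst.  A minimal user-space
 * model of the same bitmap technique, using the GCC/Clang builtins in
 * place of the kernel's bsfl()/bsrl() inlines:
 */
#if 0   /* standalone demonstration, never compiled into the kernel */
#include <stdint.h>
#include <stdio.h>

static int demo_bsfl(uint32_t v) { return __builtin_ctz(v); }       /* lowest set bit  */
static int demo_bsrl(uint32_t v) { return 31 - __builtin_clz(v); }  /* highest set bit */

int
main(void)
{
        uint32_t queuebits = (1u << 3) | (1u << 17);    /* queues 3 and 17 non-empty */

        printf("best queue  %d\n", demo_bsfl(queuebits));   /* 3: lower index wins */
        printf("worst queue %d\n", demo_bsrl(queuebits));   /* 17                  */

        queuebits &= ~(1u << 3);        /* queue 3 drained -> clear its status bit */
        printf("next best   %d\n", demo_bsfl(queuebits));   /* 17                  */
        return 0;
}
#endif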
 1402 
 1403 /*
 1404  * USED TO PUSH RUNNABLE LWPS TO THE LEAST LOADED CPU.
 1405  *
 1406  * Choose a cpu node to schedule lp on, hopefully nearby its current
 1407  * node.
 1408  *
 1409  * We give the current node a modest advantage for obvious reasons.
 1410  *
 1411  * We also give the node the thread was woken up FROM a slight advantage
 1412  * in order to try to schedule paired threads which synchronize/block waiting
 1413  * for each other fairly close to each other.  Similarly in a network setting
 1414  * this feature will also attempt to place a user process near the kernel
 1415  * protocol thread that is feeding it data.  THIS IS A CRITICAL PART of the
 1416  * algorithm as it heuristically groups synchronizing processes for locality
 1417  * of reference in multi-socket systems.
 1418  *
 1419  * We check against running processes and give a big advantage if there
 1420  * are none running.
 1421  *
 1422  * The caller will normally dfly_setrunqueue() lp on the returned queue.
 1423  *
 1424  * When the topology is known, choose a cpu whose group has, in aggregate,
 1425  * the lowest weighted load.
 1426  */
 1427 static
 1428 dfly_pcpu_t
 1429 dfly_choose_best_queue(struct lwp *lp)
 1430 {
 1431         cpumask_t wakemask;
 1432         cpumask_t mask;
 1433         cpu_node_t *cpup;
 1434         cpu_node_t *cpun;
 1435         cpu_node_t *cpub;
 1436         dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];
 1437         dfly_pcpu_t rdd;
 1438         int wakecpu;
 1439         int cpuid;
 1440         int n;
 1441         int count;
 1442         int load;
 1443         int lowest_load;
 1444 
 1445         /*
 1446          * When the topology is unknown choose a random cpu that is hopefully
 1447          * idle.
 1448          */
 1449         if (dd->cpunode == NULL)
 1450                 return (dfly_choose_queue_simple(dd, lp));
 1451 
 1452         /*
 1453          * Pairing mask
 1454          */
 1455         if ((wakecpu = lp->lwp_thread->td_wakefromcpu) >= 0)
 1456                 wakemask = dfly_pcpu[wakecpu].cpumask;
 1457         else
 1458                 wakemask = 0;
 1459 
 1460         /*
 1461          * When the topology is known, choose a cpu whose group has, in
 1462          * aggregate, the lowest weighted load.
 1463          */
 1464         cpup = root_cpu_node;
 1465         rdd = dd;
 1466 
 1467         while (cpup) {
 1468                 /*
 1469                  * Degenerate case super-root
 1470                  */
 1471                 if (cpup->child_node && cpup->child_no == 1) {
 1472                         cpup = cpup->child_node;
 1473                         continue;
 1474                 }
 1475 
 1476                 /*
 1477                  * Terminal cpunode
 1478                  */
 1479                 if (cpup->child_node == NULL) {
 1480                         rdd = &dfly_pcpu[BSFCPUMASK(cpup->members)];
 1481                         break;
 1482                 }
 1483 
 1484                 cpub = NULL;
 1485                 lowest_load = 0x7FFFFFFF;
 1486 
 1487                 for (n = 0; n < cpup->child_no; ++n) {
 1488                         /*
 1489                          * Accumulate load information for all cpus
 1490                          * which are members of this node.
 1491                          */
 1492                         cpun = &cpup->child_node[n];
 1493                         mask = cpun->members & usched_global_cpumask &
 1494                                smp_active_mask & lp->lwp_cpumask;
 1495                         if (mask == 0)
 1496                                 continue;
 1497 
 1498                         count = 0;
 1499                         load = 0;
 1500 
 1501                         while (mask) {
 1502                                 cpuid = BSFCPUMASK(mask);
 1503                                 rdd = &dfly_pcpu[cpuid];
 1504                                 load += rdd->uload;
 1505                                 load += rdd->ucount * usched_dfly_weight3;
 1506 
 1507                                 if (rdd->uschedcp == NULL &&
 1508                                     rdd->runqcount == 0 &&
 1509                                     globaldata_find(cpuid)->gd_tdrunqcount == 0
 1510                                 ) {
 1511                                         load -= usched_dfly_weight4;
 1512                                 }
 1513 #if 0
 1514                                 else if (rdd->upri > lp->lwp_priority + PPQ) {
 1515                                         load -= usched_dfly_weight4 / 2;
 1516                                 }
 1517 #endif
 1518                                 mask &= ~CPUMASK(cpuid);
 1519                                 ++count;
 1520                         }
 1521 
 1522                         /*
 1523                          * Compensate if the lp is already accounted for in
 1524                          * the aggregate uload for this mask set.  We want
 1525                          * to calculate the loads as if lp were not present,
 1526                          * otherwise the calculation is bogus.
 1527                          */
 1528                         if ((lp->lwp_mpflags & LWP_MP_ULOAD) &&
 1529                             (dd->cpumask & cpun->members)) {
 1530                                 load -= lp->lwp_uload;
 1531                                 load -= usched_dfly_weight3;
 1532                         }
 1533 
 1534                         load /= count;
 1535 
 1536                         /*
 1537                          * Advantage the cpu group (lp) is already on.
 1538                          */
 1539                         if (cpun->members & dd->cpumask)
 1540                                 load -= usched_dfly_weight1;
 1541 
 1542                         /*
 1543                          * Advantage the cpu group we want to pair (lp) to,
 1544                          * but don't let it go to the exact same cpu as
 1545                          * the wakecpu target.
 1546                          *
 1547                          * We do this by checking whether cpun is a
 1548                          * terminal node or not.  All cpun's at the same
 1549                          * level will either all be terminal or all not
 1550                          * terminal.
 1551                          *
 1552                          * If it is and we match we disadvantage the load.
 1553                          * If it is and we don't match we advantage the load.
 1554                          *
 1555                          * Also note that we are effectively disadvantaging
 1556                          * all-but-one by the same amount, so it won't affect
 1557                          * the weight1 factor for the all-but-one nodes.
 1558                          */
 1559                         if (cpun->members & wakemask) {
 1560                                 if (cpun->child_node != NULL) {
 1561                                         /* advantage */
 1562                                         load -= usched_dfly_weight2;
 1563                                 } else {
 1564                                         if (usched_dfly_features & 0x10)
 1565                                                 load += usched_dfly_weight2;
 1566                                         else
 1567                                                 load -= usched_dfly_weight2;
 1568                                 }
 1569                         }
 1570 
 1571                         /*
 1572                          * Calculate the best load
 1573                          */
 1574                         if (cpub == NULL || lowest_load > load ||
 1575                             (lowest_load == load &&
 1576                              (cpun->members & dd->cpumask))
 1577                         ) {
 1578                                 lowest_load = load;
 1579                                 cpub = cpun;
 1580                         }
 1581                 }
 1582                 cpup = cpub;
 1583         }
 1584         if (usched_dfly_chooser)
 1585                 kprintf("lp %02d->%02d %s\n",
 1586                         lp->lwp_qcpu, rdd->cpuid, lp->lwp_proc->p_comm);
 1587         return (rdd);
 1588 }
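/*
 * [Editor's illustrative sketch -- not part of usched_dfly.c]
 *
 * Roughly, the per-node score computed in the loop above is
 *
 *      load = (sum(uload) + ucount*weight3 - weight4[idle cpu]) / ncpus
 *             - weight1[current node] - weight2[wakeup node]
 *
 * and the child node with the lowest score is descended into.  A
 * compressed model of that scoring; the weights are the sysctl defaults
 * registered later in this file and are assumptions of this sketch:
 */
#if 0   /* standalone demonstration, never compiled into the kernel */
#include <stdio.h>

static int
demo_node_score(int uload_sum, int ucount, int ncpus, int has_idle_cpu,
                int is_current_node, int is_wakeup_node)
{
        int w1 = 200, w2 = 180, w3 = 40, w4 = 160;  /* assumed defaults */
        int load = uload_sum + ucount * w3;

        if (has_idle_cpu)
                load -= w4;             /* big advantage: an idle cpu exists */
        load /= ncpus;
        if (is_current_node)
                load -= w1;             /* prefer staying put      */
        if (is_wakeup_node)
                load -= w2;             /* prefer the waker's node */
        return load;
}

int
main(void)
{
        /* busier home node vs. a remote node with an idle cpu */
        printf("home   %d\n", demo_node_score(800, 4, 4, 0, 1, 0));  /* 40 */
        printf("remote %d\n", demo_node_score(200, 1, 4, 1, 0, 0));  /* 20 */
        return 0;
}
#endif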
 1589 
 1590 /*
 1591  * USED TO PULL RUNNABLE LWPS FROM THE MOST LOADED CPU.
 1592  *
 1593  * Choose the worst queue close to dd's cpu node with a non-empty runq
 1594  * that is NOT dd.  Also require that the moving of the highest-load thread
 1595  * from rdd to dd does not cause the uloads to cross each other.
 1596  *
 1597  * This is used by the thread chooser when the current cpu's queues are
 1598  * empty to steal a thread from another cpu's queue.  We want to offload
 1599  * the most heavily-loaded queue.
 1600  */
 1601 static
 1602 dfly_pcpu_t
 1603 dfly_choose_worst_queue(dfly_pcpu_t dd)
 1604 {
 1605         cpumask_t mask;
 1606         cpu_node_t *cpup;
 1607         cpu_node_t *cpun;
 1608         cpu_node_t *cpub;
 1609         dfly_pcpu_t rdd;
 1610         int cpuid;
 1611         int n;
 1612         int count;
 1613         int load;
 1614 #if 0
 1615         int pri;
 1616         int hpri;
 1617 #endif
 1618         int highest_load;
 1619 
 1620         /*
 1621          * When the topology is unknown there is nothing to steal
 1622          * from, so simply return NULL.
 1623          */
 1624         if (dd->cpunode == NULL) {
 1625                 return (NULL);
 1626         }
 1627 
 1628         /*
 1629          * When the topology is known, choose the cpu group with, in
 1630          * aggregate, the highest weighted load.
 1631          */
 1632         cpup = root_cpu_node;
 1633         rdd = dd;
 1634         while (cpup) {
 1635                 /*
 1636                  * Degenerate case super-root
 1637                  */
 1638                 if (cpup->child_node && cpup->child_no == 1) {
 1639                         cpup = cpup->child_node;
 1640                         continue;
 1641                 }
 1642 
 1643                 /*
 1644                  * Terminal cpunode
 1645                  */
 1646                 if (cpup->child_node == NULL) {
 1647                         rdd = &dfly_pcpu[BSFCPUMASK(cpup->members)];
 1648                         break;
 1649                 }
 1650 
 1651                 cpub = NULL;
 1652                 highest_load = 0;
 1653 
 1654                 for (n = 0; n < cpup->child_no; ++n) {
 1655                         /*
 1656                          * Accumulate load information for all cpus
 1657                          * which are members of this node.
 1658                          */
 1659                         cpun = &cpup->child_node[n];
 1660                         mask = cpun->members & usched_global_cpumask &
 1661                                smp_active_mask;
 1662                         if (mask == 0)
 1663                                 continue;
 1664                         count = 0;
 1665                         load = 0;
 1666 
 1667                         while (mask) {
 1668                                 cpuid = BSFCPUMASK(mask);
 1669                                 rdd = &dfly_pcpu[cpuid];
 1670                                 load += rdd->uload;
 1671                                 load += rdd->ucount * usched_dfly_weight3;
 1672                                 if (rdd->uschedcp == NULL &&
 1673                                     rdd->runqcount == 0 &&
 1674                                     globaldata_find(cpuid)->gd_tdrunqcount == 0
 1675                                 ) {
 1676                                         load -= usched_dfly_weight4;
 1677                                 }
 1678 #if 0
 1679                                 else if (rdd->upri > dd->upri + PPQ) {
 1680                                         load -= usched_dfly_weight4 / 2;
 1681                                 }
 1682 #endif
 1683                                 mask &= ~CPUMASK(cpuid);
 1684                                 ++count;
 1685                         }
 1686                         load /= count;
 1687 
 1688                         /*
 1689                          * Prefer candidates which are somewhat closer to
 1690                          * our cpu.
 1691                          */
 1692                         if (dd->cpumask & cpun->members)
 1693                                 load += usched_dfly_weight1;
 1694 
 1695                         /*
 1696                          * The best candidate is the one with the worst
 1697                          * (highest) load.
 1698                          */
 1699                         if (cpub == NULL || highest_load < load) {
 1700                                 highest_load = load;
 1701                                 cpub = cpun;
 1702                         }
 1703                 }
 1704                 cpup = cpub;
 1705         }
 1706 
 1707         /*
 1708          * We never return our own node (dd), and only return a remote
 1709          * node if its load is significantly worse than ours (i.e. where
 1710          * stealing a thread would be considered reasonable).
 1711          *
 1712          * This also helps us avoid breaking paired threads apart which
 1713          * can have disastrous effects on performance.
 1714          */
 1715         if (rdd == dd)
 1716                 return(NULL);
 1717 
 1718 #if 0
 1719         hpri = 0;
 1720         if (rdd->rtqueuebits && hpri < (pri = bsrl(rdd->rtqueuebits)))
 1721                 hpri = pri;
 1722         if (rdd->queuebits && hpri < (pri = bsrl(rdd->queuebits)))
 1723                 hpri = pri;
 1724         if (rdd->idqueuebits && hpri < (pri = bsrl(rdd->idqueuebits)))
 1725                 hpri = pri;
 1726         hpri *= PPQ;
 1727         if (rdd->uload - hpri < dd->uload + hpri)
 1728                 return(NULL);
 1729 #endif
 1730         return (rdd);
 1731 }
 1732 
 1733 static
 1734 dfly_pcpu_t
 1735 dfly_choose_queue_simple(dfly_pcpu_t dd, struct lwp *lp)
 1736 {
 1737         dfly_pcpu_t rdd;
 1738         cpumask_t tmpmask;
 1739         cpumask_t mask;
 1740         int cpuid;
 1741 
 1742         /*
 1743          * Fall back to the original heuristic and select a cpu via a rover,
 1744          * first checking cpus not currently running a user thread.
 1745          */
 1746         ++dfly_scancpu;
 1747         cpuid = (dfly_scancpu & 0xFFFF) % ncpus;
 1748         mask = ~dfly_curprocmask & dfly_rdyprocmask & lp->lwp_cpumask &
 1749                smp_active_mask & usched_global_cpumask;
 1750 
 1751         while (mask) {
 1752                 tmpmask = ~(CPUMASK(cpuid) - 1);
 1753                 if (mask & tmpmask)
 1754                         cpuid = BSFCPUMASK(mask & tmpmask);
 1755                 else
 1756                         cpuid = BSFCPUMASK(mask);
 1757                 rdd = &dfly_pcpu[cpuid];
 1758 
 1759                 if ((rdd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK))
 1760                         goto found;
 1761                 mask &= ~CPUMASK(cpuid);
 1762         }
 1763 
 1764         /*
 1765          * Then cpus which might have a currently running lp
 1766          */
 1767         cpuid = (dfly_scancpu & 0xFFFF) % ncpus;
 1768         mask = dfly_curprocmask & dfly_rdyprocmask &
 1769                lp->lwp_cpumask & smp_active_mask & usched_global_cpumask;
 1770 
 1771         while (mask) {
 1772                 tmpmask = ~(CPUMASK(cpuid) - 1);
 1773                 if (mask & tmpmask)
 1774                         cpuid = BSFCPUMASK(mask & tmpmask);
 1775                 else
 1776                         cpuid = BSFCPUMASK(mask);
 1777                 rdd = &dfly_pcpu[cpuid];
 1778 
 1779                 if ((rdd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
 1780                         goto found;
 1781                 mask &= ~CPUMASK(cpuid);
 1782         }
 1783 
 1784         /*
 1785          * If we cannot find a suitable cpu we reload from dfly_scancpu
 1786          * and round-robin.  Other cpus will pick up as they release their
 1787          * current lwps or become ready.
 1788          *
 1789          * Avoid a degenerate system lockup case if usched_global_cpumask
 1790          * is set to 0 or otherwise does not cover lwp_cpumask.
 1791          *
 1792          * We only kick the target helper thread in this case; we do not
 1793          * set the user resched flag.
 1794          */
 1795         cpuid = (dfly_scancpu & 0xFFFF) % ncpus;
 1796         if ((CPUMASK(cpuid) & usched_global_cpumask) == 0)
 1797                 cpuid = 0;
 1798         rdd = &dfly_pcpu[cpuid];
 1799 found:
 1800         return (rdd);
 1801 }
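/*
 * [Editor's illustrative sketch -- not part of usched_dfly.c]
 *
 * The scan above starts at a rotating index and wraps: masking with
 * ~(CPUMASK(cpuid) - 1) keeps only cpus at-or-above the rover, so the
 * pick is the next ready cpu at-or-after the rover, falling back to the
 * lowest ready cpu when none remain above it.  The same wrap-around
 * pick in isolation (32-bit masks assumed for simplicity):
 */
#if 0   /* standalone demonstration, never compiled into the kernel */
#include <stdint.h>
#include <stdio.h>

static int
demo_pick_wrapped(uint32_t ready_mask, int rover)
{
        uint32_t above = ready_mask & ~((1u << rover) - 1); /* cpus >= rover */

        if (above)
                return __builtin_ctz(above);
        return __builtin_ctz(ready_mask);       /* wrap to the lowest ready cpu */
}

int
main(void)
{
        uint32_t ready = (1u << 2) | (1u << 9);

        printf("%d\n", demo_pick_wrapped(ready, 5));    /* 9: next at/after 5 */
        printf("%d\n", demo_pick_wrapped(ready, 12));   /* 2: wrapped around  */
        return 0;
}
#endif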
 1802 
 1803 static
 1804 void
 1805 dfly_need_user_resched_remote(void *dummy)
 1806 {
 1807         globaldata_t gd = mycpu;
 1808         dfly_pcpu_t  dd = &dfly_pcpu[gd->gd_cpuid];
 1809 
 1810         /*
 1811          * Flag reschedule needed
 1812          */
 1813         need_user_resched();
 1814 
 1815         /*
 1816          * If no user thread is currently running we need to kick the helper
 1817          * on our cpu to recover.  Otherwise the cpu will never schedule
 1818          * anything again.
 1819          *
 1820          * We cannot schedule the process ourselves because this is an
 1821          * IPI callback and we cannot acquire spinlocks in an IPI callback.
 1822          *
 1823          * Call wakeup_mycpu to avoid sending IPIs to other CPUs
 1824          */
 1825         if (dd->uschedcp == NULL && (dfly_rdyprocmask & gd->gd_cpumask)) {
 1826                 atomic_clear_cpumask(&dfly_rdyprocmask, gd->gd_cpumask);
 1827                 wakeup_mycpu(&dd->helper_thread);
 1828         }
 1829 }
 1830 
 1831 /*
 1832  * dfly_remrunqueue_locked() removes a given process from the run queue
 1833  * that it is on, clearing the queue busy bit if it becomes empty.
 1834  *
 1835  * Note that the user process scheduler is different from the LWKT scheduler.
 1836  * The user process scheduler only manages user processes but it uses LWKT
 1837  * underneath, and a user process operating in the kernel will often be
 1838  * 'released' from our management.
 1839  *
 1840  * uload is NOT adjusted here.  It is only adjusted if the lwkt_thread goes
 1841  * to sleep or the lwp is moved to a different runq.
 1842  */
 1843 static void
 1844 dfly_remrunqueue_locked(dfly_pcpu_t rdd, struct lwp *lp)
 1845 {
 1846         struct rq *q;
 1847         u_int32_t *which;
 1848         u_int8_t pri;
 1849 
 1850         KKASSERT(rdd->runqcount >= 0);
 1851 
 1852         pri = lp->lwp_rqindex;
 1853 
 1854         switch(lp->lwp_rqtype) {
 1855         case RTP_PRIO_NORMAL:
 1856                 q = &rdd->queues[pri];
 1857                 which = &rdd->queuebits;
 1858                 break;
 1859         case RTP_PRIO_REALTIME:
 1860         case RTP_PRIO_FIFO:
 1861                 q = &rdd->rtqueues[pri];
 1862                 which = &rdd->rtqueuebits;
 1863                 break;
 1864         case RTP_PRIO_IDLE:
 1865                 q = &rdd->idqueues[pri];
 1866                 which = &rdd->idqueuebits;
 1867                 break;
 1868         default:
 1869                 panic("remrunqueue: invalid rtprio type");
 1870                 /* NOT REACHED */
 1871         }
 1872         KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
 1873         atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
 1874         TAILQ_REMOVE(q, lp, lwp_procq);
 1875         --rdd->runqcount;
 1876         if (TAILQ_EMPTY(q)) {
 1877                 KASSERT((*which & (1 << pri)) != 0,
 1878                         ("remrunqueue: remove from empty queue"));
 1879                 *which &= ~(1 << pri);
 1880         }
 1881 }
 1882 
 1883 /*
 1884  * dfly_setrunqueue_locked()
 1885  *
 1886  * Add a process whose rqtype and rqindex have previously been calculated
 1887  * onto the appropriate run queue.   Determine if the addition requires
 1888  * a reschedule on a cpu and return the cpuid or -1.
 1889  *
 1890  * NOTE:          Lower priorities are better priorities.
 1891  *
 1892  * NOTE ON ULOAD: This variable specifies the aggregate load on a cpu, the
 1893  *                sum of the rough lwp_priority for all running and runnable
 1894  *                processes.  Lower priority processes (higher lwp_priority
 1895  *                values) actually DO count as more load, not less, because
 1896  *                these are the programs which require the most care with
 1897  *                regards to cpu selection.
 1898  */
 1899 static void
 1900 dfly_setrunqueue_locked(dfly_pcpu_t rdd, struct lwp *lp)
 1901 {
 1902         struct rq *q;
 1903         u_int32_t *which;
 1904         int pri;
 1905 
 1906         KKASSERT(lp->lwp_qcpu == rdd->cpuid);
 1907 
 1908         if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) {
 1909                 atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
 1910                 atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload, lp->lwp_uload);
 1911                 atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].ucount, 1);
 1912                 atomic_add_int(&dfly_ucount, 1);
 1913         }
 1914 
 1915         pri = lp->lwp_rqindex;
 1916 
 1917         switch(lp->lwp_rqtype) {
 1918         case RTP_PRIO_NORMAL:
 1919                 q = &rdd->queues[pri];
 1920                 which = &rdd->queuebits;
 1921                 break;
 1922         case RTP_PRIO_REALTIME:
 1923         case RTP_PRIO_FIFO:
 1924                 q = &rdd->rtqueues[pri];
 1925                 which = &rdd->rtqueuebits;
 1926                 break;
 1927         case RTP_PRIO_IDLE:
 1928                 q = &rdd->idqueues[pri];
 1929                 which = &rdd->idqueuebits;
 1930                 break;
 1931         default:
 1932                 panic("setrunqueue: invalid rtprio type");
 1933                 /* NOT REACHED */
 1934         }
 1935 
 1936         /*
 1937          * Place us on the selected queue.  Determine if we should be
 1938          * placed at the head of the queue or at the end.
 1939          *
 1940          * We are placed at the tail if our round-robin count has expired,
 1941          * or is about to expire and the system thinks it's a good place to
 1942          * round-robin, or there is already a next thread on the queue
 1943          * (it might be trying to pick up where it left off and we don't
 1944          * want to interfere).
 1945          */
 1946         KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
 1947         atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
 1948         ++rdd->runqcount;
 1949 
 1950         if (lp->lwp_rrcount >= usched_dfly_rrinterval ||
 1951             (lp->lwp_rrcount >= usched_dfly_rrinterval / 2 &&
 1952              (lp->lwp_thread->td_mpflags & TDF_MP_BATCH_DEMARC)) ||
 1953             !TAILQ_EMPTY(q)
 1954         ) {
 1955                 atomic_clear_int(&lp->lwp_thread->td_mpflags,
 1956                                  TDF_MP_BATCH_DEMARC);
 1957                 lp->lwp_rrcount = 0;
 1958                 TAILQ_INSERT_TAIL(q, lp, lwp_procq);
 1959         } else {
 1960                 if (TAILQ_EMPTY(q))
 1961                         lp->lwp_rrcount = 0;
 1962                 TAILQ_INSERT_HEAD(q, lp, lwp_procq);
 1963         }
 1964         *which |= 1 << pri;
 1965 }
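/*
 * [Editor's illustrative sketch -- not part of usched_dfly.c]
 *
 * Condensed, the head-vs-tail decision above is: insert at the tail when
 * the round-robin quantum is used up (or half used with a batch demarc
 * pending), or when the queue already has waiters; otherwise re-insert
 * at the head so the thread can resume where it left off.
 */
#if 0   /* standalone demonstration, never compiled into the kernel */
static int
demo_insert_at_tail(int rrcount, int rrinterval, int batch_demarc,
                    int queue_nonempty)
{
        if (rrcount >= rrinterval)
                return 1;
        if (rrcount >= rrinterval / 2 && batch_demarc)
                return 1;
        return queue_nonempty;
}
#endif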
 1966 
 1967 /*
 1968  * For SMP systems a user scheduler helper thread is created for each
 1969  * cpu and is used to allow one cpu to wake up another for the purposes of
 1970  * scheduling userland threads from setrunqueue().
 1971  *
 1972  * UP systems do not need the helper since there is only one cpu.
 1973  *
 1974  * We can't use the idle thread for this because we might block.
 1975  * Additionally, doing things this way allows us to HLT idle cpus
 1976  * on MP systems.
 1977  */
 1978 static void
 1979 dfly_helper_thread(void *dummy)
 1980 {
 1981     globaldata_t gd;
 1982     dfly_pcpu_t dd;
 1983     dfly_pcpu_t rdd;
 1984     struct lwp *nlp;
 1985     cpumask_t mask;
 1986     int cpuid;
 1987 
 1988     gd = mycpu;
 1989     cpuid = gd->gd_cpuid;       /* doesn't change */
 1990     mask = gd->gd_cpumask;      /* doesn't change */
 1991     dd = &dfly_pcpu[cpuid];
 1992 
 1993     /*
 1994      * Since we only want to be woken up when no user processes
 1995      * are scheduled on a cpu, run at an ultra low priority.
 1996      */
 1997     lwkt_setpri_self(TDPRI_USER_SCHEDULER);
 1998 
 1999     tsleep(&dd->helper_thread, 0, "schslp", 0);
 2000 
 2001     for (;;) {
 2002         /*
 2003          * We use the LWKT deschedule-interlock trick to avoid racing
 2004          * dfly_rdyprocmask.  This means we cannot block through to the
 2005          * interlocked tsleep() call we make at the bottom of the loop.
 2006          */
 2007         crit_enter_gd(gd);
 2008         tsleep_interlock(&dd->helper_thread, 0);
 2009 
 2010         spin_lock(&dd->spin);
 2011 
 2012         atomic_set_cpumask(&dfly_rdyprocmask, mask);
 2013         clear_user_resched();   /* This satisfies the reschedule request */
 2014 #if 0
 2015         dd->rrcount = 0;        /* Reset the round-robin counter */
 2016 #endif
 2017 
 2018         if (dd->runqcount || dd->uschedcp != NULL) {
 2019                 /*
 2020                  * Threads are available.  A thread may or may not be
 2021                  * currently scheduled.  Get the best thread already queued
 2022                  * to this cpu.
 2023                  */
 2024                 nlp = dfly_chooseproc_locked(dd, dd, dd->uschedcp, 0);
 2025                 if (nlp) {
 2026                         atomic_set_cpumask(&dfly_curprocmask, mask);
 2027                         dd->upri = nlp->lwp_priority;
 2028                         dd->uschedcp = nlp;
 2029 #if 0
 2030                         dd->rrcount = 0;        /* reset round robin */
 2031 #endif
 2032                         spin_unlock(&dd->spin);
 2033                         lwkt_acquire(nlp->lwp_thread);
 2034                         lwkt_schedule(nlp->lwp_thread);
 2035                 } else {
 2036                         /*
 2037                          * This situation should not occur because we had
 2038                          * at least one thread available.
 2039                          */
 2040                         spin_unlock(&dd->spin);
 2041                 }
 2042         } else if (usched_dfly_features & 0x01) {
 2043                 /*
 2044                  * This cpu is devoid of runnable threads, so steal a thread
 2045                  * from another cpu.  Since we're stealing, might as well
 2046                  * load balance at the same time.
 2047                  *
 2048                  * We choose the highest-loaded thread from the worst queue.
 2049                  *
 2050                  * NOTE! This function only returns a non-NULL rdd when
 2051                  *       another cpu's queue is obviously overloaded.  We
 2052                  *       do not want to perform the type of rebalancing
 2053                  *       the schedclock does here because it would result
 2054                  *       in insane process pulling when 'steady' state is
 2055                  *       partially unbalanced (e.g. 6 runnables and only
 2056                  *       4 cores).
 2057                  */
 2058                 rdd = dfly_choose_worst_queue(dd);
 2059                 if (rdd && spin_trylock(&rdd->spin)) {
 2060                         nlp = dfly_chooseproc_locked(rdd, dd, NULL, 1);
 2061                         spin_unlock(&rdd->spin);
 2062                 } else {
 2063                         nlp = NULL;
 2064                 }
 2065                 if (nlp) {
 2066                         atomic_set_cpumask(&dfly_curprocmask, mask);
 2067                         dd->upri = nlp->lwp_priority;
 2068                         dd->uschedcp = nlp;
 2069 #if 0
 2070                         dd->rrcount = 0;        /* reset round robin */
 2071 #endif
 2072                         spin_unlock(&dd->spin);
 2073                         lwkt_acquire(nlp->lwp_thread);
 2074                         lwkt_schedule(nlp->lwp_thread);
 2075                 } else {
 2076                         /*
 2077                          * Leave the thread on our run queue.  Another
 2078                          * scheduler will try to pull it later.
 2079                          */
 2080                         spin_unlock(&dd->spin);
 2081                 }
 2082         } else {
 2083                 /*
 2084                  * devoid of runnable threads and not allowed to steal
 2085                  * any.
 2086                  */
 2087                 spin_unlock(&dd->spin);
 2088         }
 2089 
 2090         /*
 2091          * We're descheduled unless someone scheduled us.  Switch away.
 2092          * Exiting the critical section will cause splz() to be called
 2093          * for us if interrupts and such are pending.
 2094          */
 2095         crit_exit_gd(gd);
 2096         tsleep(&dd->helper_thread, PINTERLOCKED, "schslp", 0);
 2097     }
 2098 }
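/*
 * [Editor's illustrative sketch -- not part of usched_dfly.c]
 *
 * The interlock pattern above (tsleep_interlock() before the queues are
 * inspected, tsleep(..., PINTERLOCKED, ...) afterwards) keeps a wakeup
 * that arrives between "check for work" and "go to sleep" from being
 * lost.  A user-space analogue of the same ordering with a condition
 * variable (all names invented for the demo):
 */
#if 0   /* standalone demonstration, never compiled into the kernel */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t demo_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  demo_cv  = PTHREAD_COND_INITIALIZER;
static bool            demo_work_ready;

static void
demo_helper_wait(void)
{
        pthread_mutex_lock(&demo_mtx);          /* "interlock" taken first   */
        while (!demo_work_ready)                /* check under the lock      */
                pthread_cond_wait(&demo_cv, &demo_mtx); /* sleeps atomically */
        demo_work_ready = false;
        pthread_mutex_unlock(&demo_mtx);
}

static void
demo_wakeup(void)
{
        pthread_mutex_lock(&demo_mtx);
        demo_work_ready = true;
        pthread_cond_signal(&demo_cv);
        pthread_mutex_unlock(&demo_mtx);
}
#endif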
 2099 
 2100 #if 0
 2101 static int
 2102 sysctl_usched_dfly_stick_to_level(SYSCTL_HANDLER_ARGS)
 2103 {
 2104         int error, new_val;
 2105 
 2106         new_val = usched_dfly_stick_to_level;
 2107 
 2108         error = sysctl_handle_int(oidp, &new_val, 0, req);
 2109         if (error != 0 || req->newptr == NULL)
 2110                 return (error);
 2111         if (new_val > cpu_topology_levels_number - 1 || new_val < 0)
 2112                 return (EINVAL);
 2113         usched_dfly_stick_to_level = new_val;
 2114         return (0);
 2115 }
 2116 #endif
 2117 
 2118 /*
 2119  * Set up the queues and scheduler helpers (scheduler helpers are SMP only).
 2120  * Note that curprocmask bit 0 has already been cleared by rqinit() and
 2121  * we should not mess with it further.
 2122  */
 2123 static void
 2124 usched_dfly_cpu_init(void)
 2125 {
 2126         int i;
 2127         int j;
 2128         int cpuid;
 2129         int smt_not_supported = 0;
 2130         int cache_coherent_not_supported = 0;
 2131 
 2132         if (bootverbose)
 2133                 kprintf("Start scheduler helpers on cpus:\n");
 2134 
 2135         sysctl_ctx_init(&usched_dfly_sysctl_ctx);
 2136         usched_dfly_sysctl_tree =
 2137                 SYSCTL_ADD_NODE(&usched_dfly_sysctl_ctx,
 2138                                 SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
 2139                                 "usched_dfly", CTLFLAG_RD, 0, "");
 2140 
 2141         for (i = 0; i < ncpus; ++i) {
 2142                 dfly_pcpu_t dd = &dfly_pcpu[i];
 2143                 cpumask_t mask = CPUMASK(i);
 2144 
 2145                 if ((mask & smp_active_mask) == 0)
 2146                     continue;
 2147 
 2148                 spin_init(&dd->spin);
 2149                 dd->cpunode = get_cpu_node_by_cpuid(i);
 2150                 dd->cpuid = i;
 2151                 dd->cpumask = CPUMASK(i);
 2152                 for (j = 0; j < NQS; j++) {
 2153                         TAILQ_INIT(&dd->queues[j]);
 2154                         TAILQ_INIT(&dd->rtqueues[j]);
 2155                         TAILQ_INIT(&dd->idqueues[j]);
 2156                 }
 2157                 atomic_clear_cpumask(&dfly_curprocmask, 1);
 2158 
 2159                 if (dd->cpunode == NULL) {
 2160                         smt_not_supported = 1;
 2161                         cache_coherent_not_supported = 1;
 2162                         if (bootverbose)
 2163                                 kprintf ("\tcpu%d - WARNING: No CPU NODE "
 2164                                          "found for cpu\n", i);
 2165                 } else {
 2166                         switch (dd->cpunode->type) {
 2167                         case THREAD_LEVEL:
 2168                                 if (bootverbose)
 2169                                         kprintf ("\tcpu%d - HyperThreading "
 2170                                                  "available. Core siblings: ",
 2171                                                  i);
 2172                                 break;
 2173                         case CORE_LEVEL:
 2174                                 smt_not_supported = 1;
 2175 
 2176                                 if (bootverbose)
 2177                                         kprintf ("\tcpu%d - No HT available, "
 2178                                                  "multi-core/physical "
 2179                                                  "cpu. Physical siblings: ",
 2180                                                  i);
 2181                                 break;
 2182                         case CHIP_LEVEL:
 2183                                 smt_not_supported = 1;
 2184 
 2185                                 if (bootverbose)
 2186                                         kprintf ("\tcpu%d - No HT available, "
 2187                                                  "single-core/physical cpu. "
 2188                                                  "Package Siblings: ",
 2189                                                  i);
 2190                                 break;
 2191                         default:
 2192                                 /* Let's go for safe defaults here */
 2193                                 smt_not_supported = 1;
 2194                                 cache_coherent_not_supported = 1;
 2195                                 if (bootverbose)
 2196                                         kprintf ("\tcpu%d - Unknown cpunode->"
 2197                                                  "type=%u. Siblings: ",
 2198                                                  i,
 2199                                                  (u_int)dd->cpunode->type);
 2200                                 break;
 2201                         }
 2202 
 2203                         if (bootverbose) {
 2204                                 if (dd->cpunode->parent_node != NULL) {
 2205                                         CPUSET_FOREACH(cpuid, dd->cpunode->parent_node->members)
 2206                                                 kprintf("cpu%d ", cpuid);
 2207                                         kprintf("\n");
 2208                                 } else {
 2209                                         kprintf(" no siblings\n");
 2210                                 }
 2211                         }
 2212                 }
 2213 
 2214                 lwkt_create(dfly_helper_thread, NULL, NULL, &dd->helper_thread,
 2215                             0, i, "usched %d", i);
 2216 
 2217                 /*
 2218                  * Allow user scheduling on the target cpu.  cpu #0 has already
 2219                  * been enabled in rqinit().
 2220                  */
 2221                 if (i)
 2222                     atomic_clear_cpumask(&dfly_curprocmask, mask);
 2223                 atomic_set_cpumask(&dfly_rdyprocmask, mask);
 2224                 dd->upri = PRIBASE_NULL;
 2225 
 2226         }
 2227 
 2228         /* usched_dfly sysctl configurable parameters */
 2229 
 2230         SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
 2231                        SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
 2232                        OID_AUTO, "rrinterval", CTLFLAG_RW,
 2233                        &usched_dfly_rrinterval, 0, "");
 2234         SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
 2235                        SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
 2236                        OID_AUTO, "decay", CTLFLAG_RW,
 2237                        &usched_dfly_decay, 0, "Extra decay when not running");
 2238 
 2239         /* Add enable/disable option for SMT scheduling if supported */
 2240         if (smt_not_supported) {
 2241                 usched_dfly_smt = 0;
 2242                 SYSCTL_ADD_STRING(&usched_dfly_sysctl_ctx,
 2243                                   SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
 2244                                   OID_AUTO, "smt", CTLFLAG_RD,
 2245                                   "NOT SUPPORTED", 0, "SMT NOT SUPPORTED");
 2246         } else {
 2247                 usched_dfly_smt = 1;
 2248                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
 2249                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
 2250                                OID_AUTO, "smt", CTLFLAG_RW,
 2251                                &usched_dfly_smt, 0, "Enable SMT scheduling");
 2252         }
 2253 
 2254         /*
 2255          * Add enable/disable option for cache coherent scheduling
 2256          * if supported
 2257          */
 2258         if (cache_coherent_not_supported) {
 2259                 usched_dfly_cache_coherent = 0;
 2260                 SYSCTL_ADD_STRING(&usched_dfly_sysctl_ctx,
 2261                                   SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
 2262                                   OID_AUTO, "cache_coherent", CTLFLAG_RD,
 2263                                   "NOT SUPPORTED", 0,
 2264                                   "Cache coherence NOT SUPPORTED");
 2265         } else {
 2266                 usched_dfly_cache_coherent = 1;
 2267                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
 2268                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
 2269                                OID_AUTO, "cache_coherent", CTLFLAG_RW,
 2270                                &usched_dfly_cache_coherent, 0,
 2271                                "Enable/Disable cache coherent scheduling");
 2272 
 2273                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
 2274                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
 2275                                OID_AUTO, "weight1", CTLFLAG_RW,
 2276                                &usched_dfly_weight1, 200,
 2277                                "Weight selection for current cpu");
 2278 
 2279                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
 2280                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
 2281                                OID_AUTO, "weight2", CTLFLAG_RW,
 2282                                &usched_dfly_weight2, 180,
 2283                                "Weight selection for wakefrom cpu");
 2284 
 2285                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
 2286                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
 2287                                OID_AUTO, "weight3", CTLFLAG_RW,
 2288                                &usched_dfly_weight3, 40,
 2289                                "Weight selection for num threads on queue");
 2290 
 2291                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
 2292                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
 2293                                OID_AUTO, "weight4", CTLFLAG_RW,
 2294                                &usched_dfly_weight4, 160,
 2295                                "Availability of other idle cpus");
 2296 
 2297                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
 2298                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
 2299                                OID_AUTO, "fast_resched", CTLFLAG_RW,
 2300                                &usched_dfly_fast_resched, 0,
 2301                                "Availability of other idle cpus");
 2302 
 2303                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
 2304                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
 2305                                OID_AUTO, "features", CTLFLAG_RW,
 2306                                &usched_dfly_features, 0x8F,
 2307                                "Allow pulls into empty queues");
 2308 
 2309                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
 2310                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
 2311                                OID_AUTO, "swmask", CTLFLAG_RW,
 2312                                &usched_dfly_swmask, ~PPQMASK,
 2313                                "Queue mask to force thread switch");
 2314 
 2315 #if 0
 2316                 SYSCTL_ADD_PROC(&usched_dfly_sysctl_ctx,
 2317                                 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
 2318                                 OID_AUTO, "stick_to_level",
 2319                                 CTLTYPE_INT | CTLFLAG_RW,
 2320                                 NULL, sizeof usched_dfly_stick_to_level,
 2321                                 sysctl_usched_dfly_stick_to_level, "I",
 2322                                 "Stick a process to this level. See sysctl "
 2323                                 "parameter hw.cpu_topology.level_description");
 2324 #endif
 2325         }
 2326 }
 2327 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
 2328         usched_dfly_cpu_init, NULL)
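/*
 * [Editor's illustrative sketch -- not part of usched_dfly.c]
 *
 * The tunables registered above appear under kern.usched_dfly.*.  From
 * userland they can be inspected with sysctl(8) or, as sketched below,
 * with sysctlbyname(3); error handling is minimal and the node name is
 * taken from the registration above.
 */
#if 0   /* standalone demonstration, never compiled into the kernel */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        int val;
        size_t len = sizeof(val);

        if (sysctlbyname("kern.usched_dfly.rrinterval", &val, &len,
                         NULL, 0) == 0)
                printf("rrinterval = %d\n", val);
        return 0;
}
#endif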
