fxr.watson.org: FREEBSD-7-STABLE sys/kern/sched

FreeBSD/Linux Kernel Cross Reference
sys/kern/sched_ule.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10

/*-
 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This file implements the ULE scheduler. ULE supports independent CPU
 * run queues and fine grain locking. It has superior interactive
 * performance under load even on uni-processor systems.
 *
 * etymology:
 *   ULE is the last three letters in schedule. It owes its name to a
 * generic user created for a scheduling system by Paul Mikesell at
 * Isilon Systems and a general lack of creativity on the part of the author.
36 */ 37 38 #include <sys/cdefs.h> 39 __FBSDID("$FreeBSD$"); 40 41 #include "opt_hwpmc_hooks.h" 42 #include "opt_kdtrace.h" 43 #include "opt_sched.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/kdb.h> 48 #include <sys/kernel.h> 49 #include <sys/ktr.h> 50 #include <sys/lock.h> 51 #include <sys/mutex.h> 52 #include <sys/proc.h> 53 #include <sys/resource.h> 54 #include <sys/resourcevar.h> 55 #include <sys/sched.h> 56 #include <sys/smp.h> 57 #include <sys/sx.h> 58 #include <sys/sysctl.h> 59 #include <sys/sysproto.h> 60 #include <sys/turnstile.h> 61 #include <sys/umtx.h> 62 #include <sys/vmmeter.h> 63 #include <sys/cpuset.h> 64 #ifdef KTRACE 65 #include <sys/uio.h> 66 #include <sys/ktrace.h> 67 #endif 68 69 #ifdef HWPMC_HOOKS 70 #include <sys/pmckern.h> 71 #endif 72 73 #ifdef KDTRACE_HOOKS 74 #include <sys/dtrace_bsd.h> 75 int dtrace_vtime_active; 76 dtrace_vtime_switch_func_t dtrace_vtime_switch_func; 77 #endif 78 79 #include <machine/cpu.h> 80 #include <machine/smp.h> 81 82 #if !defined(__i386__) && !defined(__amd64__) && !defined(__arm__) 83 #error "This architecture is not currently compatible with ULE" 84 #endif 85 86 #define KTR_ULE 0 87 88 /* 89 * Thread scheduler specific section. All fields are protected 90 * by the thread lock. 91 */ 92 struct td_sched { 93 TAILQ_ENTRY(td_sched) ts_procq; /* Run queue. */ 94 struct thread *ts_thread; /* Active associated thread. */ 95 struct runq *ts_runq; /* Run-queue we're queued on. */ 96 short ts_flags; /* TSF_* flags. */ 97 u_char ts_rqindex; /* Run queue index. */ 98 u_char ts_cpu; /* CPU that we have affinity for. */ 99 int ts_slice; /* Ticks of slice remaining. */ 100 u_int ts_slptime; /* Number of ticks we vol. 
slept */ 101 u_int ts_runtime; /* Number of ticks we were running */ 102 /* The following variables are only used for pctcpu calculation */ 103 int ts_ltick; /* Last tick that we were running on */ 104 int ts_incrtick; /* Last tick that we incremented on */ 105 int ts_ftick; /* First tick that we were running on */ 106 int ts_ticks; /* Tick count */ 107 #ifdef SMP 108 int ts_rltick; /* Real last tick, for affinity. */ 109 #endif 110 }; 111 /* flags kept in ts_flags */ 112 #define TSF_BOUND 0x0001 /* Thread can not migrate. */ 113 #define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */ 114 115 static struct td_sched td_sched0; 116 117 #define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) 118 #define THREAD_CAN_SCHED(td, cpu) \ 119 CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) 120 121 /* 122 * Priority ranges used for interactive and non-interactive timeshare 123 * threads. Interactive threads use realtime priorities. 124 */ 125 #define PRI_MIN_INTERACT PRI_MIN_REALTIME 126 #define PRI_MAX_INTERACT PRI_MAX_REALTIME 127 #define PRI_MIN_BATCH PRI_MIN_TIMESHARE 128 #define PRI_MAX_BATCH PRI_MAX_TIMESHARE 129 130 /* 131 * Cpu percentage computation macros and defines. 132 * 133 * SCHED_TICK_SECS: Number of seconds to average the cpu usage across. 134 * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across. 135 * SCHED_TICK_MAX: Maximum number of ticks before scaling back. 136 * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. 137 * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count. 138 * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks. 
139 */ 140 #define SCHED_TICK_SECS 10 141 #define SCHED_TICK_TARG (hz * SCHED_TICK_SECS) 142 #define SCHED_TICK_MAX (SCHED_TICK_TARG + hz) 143 #define SCHED_TICK_SHIFT 10 144 #define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT) 145 #define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz)) 146 147 /* 148 * These macros determine priorities for non-interactive threads. They are 149 * assigned a priority based on their recent cpu utilization as expressed 150 * by the ratio of ticks to the tick total. NHALF priorities at the start 151 * and end of the MIN to MAX timeshare range are only reachable with negative 152 * or positive nice respectively. 153 * 154 * PRI_RANGE: Priority range for utilization dependent priorities. 155 * PRI_NRESV: Number of nice values. 156 * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total. 157 * PRI_NICE: Determines the part of the priority inherited from nice. 158 */ 159 #define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN) 160 #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) 161 #define SCHED_PRI_MIN (PRI_MIN_BATCH + SCHED_PRI_NHALF) 162 #define SCHED_PRI_MAX (PRI_MAX_BATCH - SCHED_PRI_NHALF) 163 #define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN + 1) 164 #define SCHED_PRI_TICKS(ts) \ 165 (SCHED_TICK_HZ((ts)) / \ 166 (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE)) 167 #define SCHED_PRI_NICE(nice) (nice) 168 169 /* 170 * These determine the interactivity of a process. Interactivity differs from 171 * cpu utilization in that it expresses the voluntary time slept vs time ran 172 * while cpu utilization includes all time not running. This more accurately 173 * models the intent of the thread. 174 * 175 * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate 176 * before throttling back. 177 * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. 178 * INTERACT_MAX: Maximum interactivity value. Smaller is better. 
179 * INTERACT_THRESH: Threshhold for placement on the current runq. 180 */ 181 #define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT) 182 #define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT) 183 #define SCHED_INTERACT_MAX (100) 184 #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) 185 #define SCHED_INTERACT_THRESH (30) 186 187 /* 188 * tickincr: Converts a stathz tick into a hz domain scaled by 189 * the shift factor. Without the shift the error rate 190 * due to rounding would be unacceptably high. 191 * realstathz: stathz is sometimes 0 and run off of hz. 192 * sched_slice: Runtime of each thread before rescheduling. 193 * preempt_thresh: Priority threshold for preemption and remote IPIs. 194 */ 195 static int sched_interact = SCHED_INTERACT_THRESH; 196 static int realstathz; 197 static int tickincr; 198 static int sched_slice; 199 #ifdef PREEMPTION 200 #ifdef FULL_PREEMPTION 201 static int preempt_thresh = PRI_MAX_IDLE; 202 #else 203 static int preempt_thresh = PRI_MIN_KERN; 204 #endif 205 #else 206 static int preempt_thresh = 0; 207 #endif 208 209 /* 210 * tdq - per processor runqs and statistics. All fields are protected by the 211 * tdq_lock. The load and lowpri may be accessed without to avoid excess 212 * locking in sched_pickcpu(); 213 */ 214 struct tdq { 215 struct mtx *tdq_lock; /* Pointer to group lock. */ 216 struct runq tdq_realtime; /* real-time run queue. */ 217 struct runq tdq_timeshare; /* timeshare run queue. */ 218 struct runq tdq_idle; /* Queue of IDLE threads. */ 219 int tdq_load; /* Aggregate load. */ 220 u_char tdq_idx; /* Current insert index. */ 221 u_char tdq_ridx; /* Current removal index. */ 222 #ifdef SMP 223 u_char tdq_lowpri; /* Lowest priority thread. */ 224 int tdq_transferable; /* Transferable thread count. */ 225 LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */ 226 struct tdq_group *tdq_group; /* Our processor group. */ 227 #else 228 int tdq_sysload; /* For loadavg, !ITHD load. 
*/ 229 #endif 230 } __aligned(64); 231 232 233 #ifdef SMP 234 /* 235 * tdq groups are groups of processors which can cheaply share threads. When 236 * one processor in the group goes idle it will check the runqs of the other 237 * processors in its group prior to halting and waiting for an interrupt. 238 * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA. 239 * In a numa environment we'd want an idle bitmap per group and a two tiered 240 * load balancer. 241 */ 242 struct tdq_group { 243 struct mtx tdg_lock; /* Protects all fields below. */ 244 int tdg_cpus; /* Count of CPUs in this tdq group. */ 245 cpumask_t tdg_cpumask; /* Mask of cpus in this group. */ 246 cpumask_t tdg_idlemask; /* Idle cpus in this group. */ 247 cpumask_t tdg_mask; /* Bit mask for first cpu. */ 248 int tdg_load; /* Total load of this group. */ 249 int tdg_transferable; /* Transferable load of this group. */ 250 LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */ 251 char tdg_name[16]; /* lock name. */ 252 } __aligned(64); 253 254 #define SCHED_AFFINITY_DEFAULT (max(1, hz / 300)) 255 #define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity) 256 257 /* 258 * Run-time tunables. 259 */ 260 static int rebalance = 1; 261 static int balance_interval = 128; /* Default set in sched_initticks(). */ 262 static int pick_pri = 1; 263 static int affinity; 264 static int tryself = 1; 265 static int steal_htt = 1; 266 static int steal_idle = 1; 267 static int steal_thresh = 2; 268 static int topology = 0; 269 270 /* 271 * One thread queue per processor. 
272 */ 273 static volatile cpumask_t tdq_idle; 274 static int tdg_maxid; 275 static struct tdq tdq_cpu[MAXCPU]; 276 static struct tdq_group tdq_groups[MAXCPU]; 277 static struct tdq *balance_tdq; 278 static int balance_group_ticks; 279 static int balance_ticks; 280 281 #define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)]) 282 #define TDQ_CPU(x) (&tdq_cpu[(x)]) 283 #define TDQ_ID(x) ((int)((x) - tdq_cpu)) 284 #define TDQ_GROUP(x) (&tdq_groups[(x)]) 285 #define TDG_ID(x) ((int)((x) - tdq_groups)) 286 #else /* !SMP */ 287 static struct tdq tdq_cpu; 288 static struct mtx tdq_lock; 289 290 #define TDQ_ID(x) (0) 291 #define TDQ_SELF() (&tdq_cpu) 292 #define TDQ_CPU(x) (&tdq_cpu) 293 #endif 294 295 #define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) 296 #define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) 297 #define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) 298 #define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) 299 #define TDQ_LOCKPTR(t) ((t)->tdq_lock) 300 301 static void sched_priority(struct thread *); 302 static void sched_thread_priority(struct thread *, u_char); 303 static int sched_interact_score(struct thread *); 304 static void sched_interact_update(struct thread *); 305 static void sched_interact_fork(struct thread *); 306 static void sched_pctcpu_update(struct td_sched *); 307 308 /* Operations on per processor queues */ 309 static struct td_sched * tdq_choose(struct tdq *); 310 static void tdq_setup(struct tdq *); 311 static void tdq_load_add(struct tdq *, struct td_sched *); 312 static void tdq_load_rem(struct tdq *, struct td_sched *); 313 static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int); 314 static __inline void tdq_runq_rem(struct tdq *, struct td_sched *); 315 void tdq_print(int cpu); 316 static void runq_print(struct runq *rq); 317 static void tdq_add(struct tdq *, struct thread *, int); 318 #ifdef SMP 319 static void tdq_move(struct tdq *, struct tdq *); 320 static int tdq_idled(struct tdq *); 
321 static void tdq_notify(struct td_sched *); 322 static struct td_sched *tdq_steal(struct tdq *, int); 323 static struct td_sched *runq_steal(struct runq *, int); 324 static int sched_pickcpu(struct thread *, int); 325 static void sched_balance(void); 326 static void sched_balance_groups(void); 327 static void sched_balance_group(struct tdq_group *); 328 static void sched_balance_pair(struct tdq *, struct tdq *); 329 static inline struct tdq *sched_setcpu(struct td_sched *, int, int); 330 static inline void thread_unblock_switch(struct thread *, struct mtx *); 331 static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int); 332 #endif 333 334 static void sched_setup(void *dummy); 335 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); 336 337 static void sched_initticks(void *dummy); 338 SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, 339 NULL); 340 341 /* 342 * Print the threads waiting on a run-queue. 343 */ 344 static void 345 runq_print(struct runq *rq) 346 { 347 struct rqhead *rqh; 348 struct td_sched *ts; 349 int pri; 350 int j; 351 int i; 352 353 for (i = 0; i < RQB_LEN; i++) { 354 printf("\t\trunq bits %d 0x%zx\n", 355 i, rq->rq_status.rqb_bits[i]); 356 for (j = 0; j < RQB_BPW; j++) 357 if (rq->rq_status.rqb_bits[i] & (1ul << j)) { 358 pri = j + (i << RQB_L2BPW); 359 rqh = &rq->rq_queues[pri]; 360 TAILQ_FOREACH(ts, rqh, ts_procq) { 361 printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", 362 ts->ts_thread, ts->ts_thread->td_proc->p_comm, ts->ts_thread->td_priority, ts->ts_rqindex, pri); 363 } 364 } 365 } 366 } 367 368 /* 369 * Print the status of a per-cpu thread queue. Should be a ddb show cmd. 
370 */ 371 void 372 tdq_print(int cpu) 373 { 374 struct tdq *tdq; 375 376 tdq = TDQ_CPU(cpu); 377 378 printf("tdq %d:\n", TDQ_ID(tdq)); 379 printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq)); 380 printf("\tload: %d\n", tdq->tdq_load); 381 printf("\ttimeshare idx: %d\n", tdq->tdq_idx); 382 printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); 383 printf("\trealtime runq:\n"); 384 runq_print(&tdq->tdq_realtime); 385 printf("\ttimeshare runq:\n"); 386 runq_print(&tdq->tdq_timeshare); 387 printf("\tidle runq:\n"); 388 runq_print(&tdq->tdq_idle); 389 #ifdef SMP 390 printf("\tload transferable: %d\n", tdq->tdq_transferable); 391 printf("\tlowest priority: %d\n", tdq->tdq_lowpri); 392 printf("\tgroup: %d\n", TDG_ID(tdq->tdq_group)); 393 printf("\tLock name: %s\n", tdq->tdq_group->tdg_name); 394 #endif 395 } 396 397 #define TS_RQ_PPQ (((PRI_MAX_BATCH - PRI_MIN_BATCH) + 1) / RQ_NQS) 398 /* 399 * Add a thread to the actual run-queue. Keeps transferable counts up to 400 * date with what is actually on the run-queue. Selects the correct 401 * queue position for timeshare threads. 402 */ 403 static __inline void 404 tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags) 405 { 406 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 407 THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); 408 #ifdef SMP 409 if (THREAD_CAN_MIGRATE(ts->ts_thread)) { 410 tdq->tdq_transferable++; 411 tdq->tdq_group->tdg_transferable++; 412 ts->ts_flags |= TSF_XFERABLE; 413 } 414 #endif 415 if (ts->ts_runq == &tdq->tdq_timeshare) { 416 u_char pri; 417 418 pri = ts->ts_thread->td_priority; 419 KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH, 420 ("Invalid priority %d on timeshare runq", pri)); 421 /* 422 * This queue contains only priorities between MIN and MAX 423 * realtime. Use the whole queue to represent these values. 
424 */ 425 if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) { 426 pri = (pri - PRI_MIN_BATCH) / TS_RQ_PPQ; 427 pri = (pri + tdq->tdq_idx) % RQ_NQS; 428 /* 429 * This effectively shortens the queue by one so we 430 * can have a one slot difference between idx and 431 * ridx while we wait for threads to drain. 432 */ 433 if (tdq->tdq_ridx != tdq->tdq_idx && 434 pri == tdq->tdq_ridx) 435 pri = (unsigned char)(pri - 1) % RQ_NQS; 436 } else 437 pri = tdq->tdq_ridx; 438 runq_add_pri(ts->ts_runq, ts, pri, flags); 439 } else 440 runq_add(ts->ts_runq, ts, flags); 441 } 442 443 /* 444 * Remove a thread from a run-queue. This typically happens when a thread 445 * is selected to run. Running threads are not on the queue and the 446 * transferable count does not reflect them. 447 */ 448 static __inline void 449 tdq_runq_rem(struct tdq *tdq, struct td_sched *ts) 450 { 451 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 452 KASSERT(ts->ts_runq != NULL, 453 ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread)); 454 #ifdef SMP 455 if (ts->ts_flags & TSF_XFERABLE) { 456 tdq->tdq_transferable--; 457 tdq->tdq_group->tdg_transferable--; 458 ts->ts_flags &= ~TSF_XFERABLE; 459 } 460 #endif 461 if (ts->ts_runq == &tdq->tdq_timeshare) { 462 if (tdq->tdq_idx != tdq->tdq_ridx) 463 runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx); 464 else 465 runq_remove_idx(ts->ts_runq, ts, NULL); 466 /* 467 * For timeshare threads we update the priority here so 468 * the priority reflects the time we've been sleeping. 469 */ 470 ts->ts_ltick = ticks; 471 sched_pctcpu_update(ts); 472 sched_priority(ts->ts_thread); 473 } else 474 runq_remove(ts->ts_runq, ts); 475 } 476 477 /* 478 * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load 479 * for this thread to the referenced thread queue. 
480 */ 481 static void 482 tdq_load_add(struct tdq *tdq, struct td_sched *ts) 483 { 484 int class; 485 486 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 487 THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); 488 class = PRI_BASE(ts->ts_thread->td_pri_class); 489 tdq->tdq_load++; 490 CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load); 491 if (class != PRI_ITHD && 492 (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) 493 #ifdef SMP 494 tdq->tdq_group->tdg_load++; 495 #else 496 tdq->tdq_sysload++; 497 #endif 498 } 499 500 /* 501 * Remove the load from a thread that is transitioning to a sleep state or 502 * exiting. 503 */ 504 static void 505 tdq_load_rem(struct tdq *tdq, struct td_sched *ts) 506 { 507 int class; 508 509 THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); 510 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 511 class = PRI_BASE(ts->ts_thread->td_pri_class); 512 if (class != PRI_ITHD && 513 (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) 514 #ifdef SMP 515 tdq->tdq_group->tdg_load--; 516 #else 517 tdq->tdq_sysload--; 518 #endif 519 KASSERT(tdq->tdq_load != 0, 520 ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); 521 tdq->tdq_load--; 522 CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); 523 ts->ts_runq = NULL; 524 } 525 526 #ifdef SMP 527 /* 528 * sched_balance is a simple CPU load balancing algorithm. It operates by 529 * finding the least loaded and most loaded cpu and equalizing their load 530 * by migrating some processes. 531 * 532 * Dealing only with two CPUs at a time has two advantages. Firstly, most 533 * installations will only have 2 cpus. Secondly, load balancing too much at 534 * once can have an unpleasant effect on the system. The scheduler rarely has 535 * enough information to make perfect decisions. So this algorithm chooses 536 * simplicity and more gradual effects on load in larger systems. 
537 * 538 */ 539 static void 540 sched_balance() 541 { 542 struct tdq_group *high; 543 struct tdq_group *low; 544 struct tdq_group *tdg; 545 struct tdq *tdq; 546 int cnt; 547 int i; 548 549 /* 550 * Select a random time between .5 * balance_interval and 551 * 1.5 * balance_interval. 552 */ 553 balance_ticks = max(balance_interval / 2, 1); 554 balance_ticks += random() % balance_interval; 555 if (smp_started == 0 || rebalance == 0) 556 return; 557 tdq = TDQ_SELF(); 558 TDQ_UNLOCK(tdq); 559 low = high = NULL; 560 i = random() % (tdg_maxid + 1); 561 for (cnt = 0; cnt <= tdg_maxid; cnt++) { 562 tdg = TDQ_GROUP(i); 563 /* 564 * Find the CPU with the highest load that has some 565 * threads to transfer. 566 */ 567 if ((high == NULL || tdg->tdg_load > high->tdg_load) 568 && tdg->tdg_transferable) 569 high = tdg; 570 if (low == NULL || tdg->tdg_load < low->tdg_load) 571 low = tdg; 572 if (++i > tdg_maxid) 573 i = 0; 574 } 575 if (low != NULL && high != NULL && high != low) 576 sched_balance_pair(LIST_FIRST(&high->tdg_members), 577 LIST_FIRST(&low->tdg_members)); 578 TDQ_LOCK(tdq); 579 } 580 581 /* 582 * Balance load between CPUs in a group. Will only migrate within the group. 583 */ 584 static void 585 sched_balance_groups() 586 { 587 struct tdq *tdq; 588 int i; 589 590 /* 591 * Select a random time between .5 * balance_interval and 592 * 1.5 * balance_interval. 593 */ 594 balance_group_ticks = max(balance_interval / 2, 1); 595 balance_group_ticks += random() % balance_interval; 596 if (smp_started == 0 || rebalance == 0) 597 return; 598 tdq = TDQ_SELF(); 599 TDQ_UNLOCK(tdq); 600 for (i = 0; i <= tdg_maxid; i++) 601 sched_balance_group(TDQ_GROUP(i)); 602 TDQ_LOCK(tdq); 603 } 604 605 /* 606 * Finds the greatest imbalance between two tdqs in a group. 
607 */ 608 static void 609 sched_balance_group(struct tdq_group *tdg) 610 { 611 struct tdq *tdq; 612 struct tdq *high; 613 struct tdq *low; 614 int load; 615 616 if (tdg->tdg_transferable == 0) 617 return; 618 low = NULL; 619 high = NULL; 620 LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) { 621 load = tdq->tdq_load; 622 if (high == NULL || load > high->tdq_load) 623 high = tdq; 624 if (low == NULL || load < low->tdq_load) 625 low = tdq; 626 } 627 if (high != NULL && low != NULL && high != low) 628 sched_balance_pair(high, low); 629 } 630 631 /* 632 * Lock two thread queues using their address to maintain lock order. 633 */ 634 static void 635 tdq_lock_pair(struct tdq *one, struct tdq *two) 636 { 637 if (one < two) { 638 TDQ_LOCK(one); 639 TDQ_LOCK_FLAGS(two, MTX_DUPOK); 640 } else { 641 TDQ_LOCK(two); 642 TDQ_LOCK_FLAGS(one, MTX_DUPOK); 643 } 644 } 645 646 /* 647 * Unlock two thread queues. Order is not important here. 648 */ 649 static void 650 tdq_unlock_pair(struct tdq *one, struct tdq *two) 651 { 652 TDQ_UNLOCK(one); 653 TDQ_UNLOCK(two); 654 } 655 656 /* 657 * Transfer load between two imbalanced thread queues. 658 */ 659 static void 660 sched_balance_pair(struct tdq *high, struct tdq *low) 661 { 662 int transferable; 663 int high_load; 664 int low_load; 665 int move; 666 int diff; 667 int i; 668 669 tdq_lock_pair(high, low); 670 /* 671 * If we're transfering within a group we have to use this specific 672 * tdq's transferable count, otherwise we can steal from other members 673 * of the group. 674 */ 675 if (high->tdq_group == low->tdq_group) { 676 transferable = high->tdq_transferable; 677 high_load = high->tdq_load; 678 low_load = low->tdq_load; 679 } else { 680 transferable = high->tdq_group->tdg_transferable; 681 high_load = high->tdq_group->tdg_load; 682 low_load = low->tdq_group->tdg_load; 683 } 684 /* 685 * Determine what the imbalance is and then adjust that to how many 686 * threads we actually have to give up (transferable). 
687 */ 688 if (transferable != 0) { 689 diff = high_load - low_load; 690 move = diff / 2; 691 if (diff & 0x1) 692 move++; 693 move = min(move, transferable); 694 for (i = 0; i < move; i++) 695 tdq_move(high, low); 696 /* 697 * IPI the target cpu to force it to reschedule with the new 698 * workload. 699 */ 700 ipi_selected(1 << TDQ_ID(low), IPI_PREEMPT); 701 } 702 tdq_unlock_pair(high, low); 703 return; 704 } 705 706 /* 707 * Move a thread from one thread queue to another. 708 */ 709 static void 710 tdq_move(struct tdq *from, struct tdq *to) 711 { 712 struct td_sched *ts; 713 struct thread *td; 714 struct tdq *tdq; 715 int cpu; 716 717 TDQ_LOCK_ASSERT(from, MA_OWNED); 718 TDQ_LOCK_ASSERT(to, MA_OWNED); 719 720 tdq = from; 721 cpu = TDQ_ID(to); 722 ts = tdq_steal(tdq, cpu); 723 if (ts == NULL) { 724 struct tdq_group *tdg; 725 726 tdg = tdq->tdq_group; 727 LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) { 728 if (tdq == from || tdq->tdq_transferable == 0) 729 continue; 730 ts = tdq_steal(tdq, cpu); 731 break; 732 } 733 if (ts == NULL) 734 return; 735 } 736 if (tdq == to) 737 return; 738 td = ts->ts_thread; 739 /* 740 * Although the run queue is locked the thread may be blocked. Lock 741 * it to clear this and acquire the run-queue lock. 742 */ 743 thread_lock(td); 744 /* Drop recursive lock on from acquired via thread_lock(). */ 745 TDQ_UNLOCK(from); 746 sched_rem(td); 747 ts->ts_cpu = cpu; 748 td->td_lock = TDQ_LOCKPTR(to); 749 tdq_add(to, td, SRQ_YIELDING); 750 } 751 752 /* 753 * This tdq has idled. Try to steal a thread from another cpu and switch 754 * to it. 
755 */ 756 static int 757 tdq_idled(struct tdq *tdq) 758 { 759 struct tdq_group *tdg; 760 struct tdq *steal; 761 int highload; 762 int highcpu; 763 int cpu; 764 765 if (smp_started == 0 || steal_idle == 0) 766 return (1); 767 /* We don't want to be preempted while we're iterating over tdqs */ 768 spinlock_enter(); 769 tdg = tdq->tdq_group; 770 /* 771 * If we're in a cpu group, try and steal threads from another cpu in 772 * the group before idling. In a HTT group all cpus share the same 773 * run-queue lock, however, we still need a recursive lock to 774 * call tdq_move(). 775 */ 776 if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) { 777 TDQ_LOCK(tdq); 778 LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) { 779 if (steal == tdq || steal->tdq_transferable == 0) 780 continue; 781 TDQ_LOCK(steal); 782 goto steal; 783 } 784 TDQ_UNLOCK(tdq); 785 } 786 /* 787 * Find the least loaded CPU with a transferable thread and attempt 788 * to steal it. We make a lockless pass and then verify that the 789 * thread is still available after locking. 790 */ 791 for (;;) { 792 highcpu = 0; 793 highload = 0; 794 for (cpu = 0; cpu <= mp_maxid; cpu++) { 795 if (CPU_ABSENT(cpu)) 796 continue; 797 steal = TDQ_CPU(cpu); 798 if (steal->tdq_transferable == 0) 799 continue; 800 if (steal->tdq_load < highload) 801 continue; 802 highload = steal->tdq_load; 803 highcpu = cpu; 804 } 805 if (highload < steal_thresh) 806 break; 807 steal = TDQ_CPU(highcpu); 808 if (steal == tdq) 809 break; 810 tdq_lock_pair(tdq, steal); 811 if (steal->tdq_load >= steal_thresh && steal->tdq_transferable) 812 goto steal; 813 tdq_unlock_pair(tdq, steal); 814 } 815 spinlock_exit(); 816 return (1); 817 steal: 818 spinlock_exit(); 819 tdq_move(steal, tdq); 820 TDQ_UNLOCK(steal); 821 mi_switch(SW_VOL, NULL); 822 thread_unlock(curthread); 823 824 return (0); 825 } 826 827 /* 828 * Notify a remote cpu of new work. Sends an IPI if criteria are met. 
829 */ 830 static void 831 tdq_notify(struct td_sched *ts) 832 { 833 struct thread *ctd; 834 struct pcpu *pcpu; 835 int cpri; 836 int pri; 837 int cpu; 838 839 cpu = ts->ts_cpu; 840 pri = ts->ts_thread->td_priority; 841 pcpu = pcpu_find(cpu); 842 ctd = pcpu->pc_curthread; 843 cpri = ctd->td_priority; 844 845 /* 846 * If our priority is not better than the current priority there is 847 * nothing to do. 848 */ 849 if (pri > cpri) 850 return; 851 /* 852 * Always IPI idle. 853 */ 854 if (cpri > PRI_MIN_IDLE) 855 goto sendipi; 856 /* 857 * If we're interactive or better and there is non-interactive 858 * or worse running send an IPI. 859 */ 860 if (pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT) 861 goto sendipi; 862 /* 863 * Otherwise only IPI if we exceed the threshold. 864 */ 865 if (pri > preempt_thresh) 866 return; 867 sendipi: 868 ipi_selected(1 << cpu, IPI_PREEMPT); 869 } 870 871 /* 872 * Steals load from a timeshare queue. Honors the rotating queue head 873 * index. 874 */ 875 static struct td_sched * 876 runq_steal_from(struct runq *rq, int cpu, u_char start) 877 { 878 struct td_sched *ts; 879 struct rqbits *rqb; 880 struct rqhead *rqh; 881 int first; 882 int bit; 883 int pri; 884 int i; 885 886 rqb = &rq->rq_status; 887 bit = start & (RQB_BPW -1); 888 pri = 0; 889 first = 0; 890 again: 891 for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) { 892 if (rqb->rqb_bits[i] == 0) 893 continue; 894 if (bit != 0) { 895 for (pri = bit; pri < RQB_BPW; pri++) 896 if (rqb->rqb_bits[i] & (1ul << pri)) 897 break; 898 if (pri >= RQB_BPW) 899 continue; 900 } else 901 pri = RQB_FFS(rqb->rqb_bits[i]); 902 pri += (i << RQB_L2BPW); 903 rqh = &rq->rq_queues[pri]; 904 TAILQ_FOREACH(ts, rqh, ts_procq) { 905 if (first && THREAD_CAN_MIGRATE(ts->ts_thread) && 906 THREAD_CAN_SCHED(ts->ts_thread, cpu)) 907 return (ts); 908 first = 1; 909 } 910 } 911 if (start != 0) { 912 start = 0; 913 goto again; 914 } 915 916 return (NULL); 917 } 918 919 /* 920 * Steals load from a standard linear 
queue. 921 */ 922 static struct td_sched * 923 runq_steal(struct runq *rq, int cpu) 924 { 925 struct rqhead *rqh; 926 struct rqbits *rqb; 927 struct td_sched *ts; 928 int word; 929 int bit; 930 931 rqb = &rq->rq_status; 932 for (word = 0; word < RQB_LEN; word++) { 933 if (rqb->rqb_bits[word] == 0) 934 continue; 935 for (bit = 0; bit < RQB_BPW; bit++) { 936 if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) 937 continue; 938 rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; 939 TAILQ_FOREACH(ts, rqh, ts_procq) 940 if (THREAD_CAN_MIGRATE(ts->ts_thread) && 941 THREAD_CAN_SCHED(ts->ts_thread, cpu)) 942 return (ts); 943 } 944 } 945 return (NULL); 946 } 947 948 /* 949 * Attempt to steal a thread in priority order from a thread queue. 950 */ 951 static struct td_sched * 952 tdq_steal(struct tdq *tdq, int cpu) 953 { 954 struct td_sched *ts; 955 956 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 957 if ((ts = runq_steal(&tdq->tdq_realtime, cpu)) != NULL) 958 return (ts); 959 if ((ts = runq_steal_from(&tdq->tdq_timeshare, 960 cpu, tdq->tdq_ridx)) != NULL) 961 return (ts); 962 return (runq_steal(&tdq->tdq_idle, cpu)); 963 } 964 965 /* 966 * Sets the thread lock and ts_cpu to match the requested cpu. Unlocks the 967 * current lock and returns with the assigned queue locked. 968 */ 969 static inline struct tdq * 970 sched_setcpu(struct td_sched *ts, int cpu, int flags) 971 { 972 struct thread *td; 973 struct tdq *tdq; 974 975 THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); 976 977 tdq = TDQ_CPU(cpu); 978 td = ts->ts_thread; 979 ts->ts_cpu = cpu; 980 981 /* If the lock matches just return the queue. */ 982 if (td->td_lock == TDQ_LOCKPTR(tdq)) 983 return (tdq); 984 #ifdef notyet 985 /* 986 * If the thread isn't running its lockptr is a 987 * turnstile or a sleepqueue. We can just lock_set without 988 * blocking. 
989 */ 990 if (TD_CAN_RUN(td)) { 991 TDQ_LOCK(tdq); 992 thread_lock_set(td, TDQ_LOCKPTR(tdq)); 993 return (tdq); 994 } 995 #endif 996 /* 997 * The hard case, migration, we need to block the thread first to 998 * prevent order reversals with other cpus locks. 999 */ 1000 spinlock_enter(); 1001 thread_lock_block(td); 1002 TDQ_LOCK(tdq); 1003 thread_lock_unblock(td, TDQ_LOCKPTR(tdq)); 1004 spinlock_exit(); 1005 return (tdq); 1006 } 1007 1008 /* 1009 * Find the thread queue running the lowest priority thread. 1010 */ 1011 static int 1012 tdq_lowestpri(struct thread *td) 1013 { 1014 struct tdq *tdq; 1015 int lowpri; 1016 int lowcpu; 1017 int lowload; 1018 int load; 1019 int cpu; 1020 int pri; 1021 1022 lowload = 0; 1023 lowpri = lowcpu = 0; 1024 for (cpu = 0; cpu <= mp_maxid; cpu++) { 1025 if (CPU_ABSENT(cpu)) 1026 continue; 1027 if (!THREAD_CAN_SCHED(td, cpu)) 1028 continue; 1029 tdq = TDQ_CPU(cpu); 1030 pri = tdq->tdq_lowpri; 1031 load = TDQ_CPU(cpu)->tdq_load; 1032 CTR4(KTR_ULE, 1033 "cpu %d pri %d lowcpu %d lowpri %d", 1034 cpu, pri, lowcpu, lowpri); 1035 if (pri < lowpri) 1036 continue; 1037 if (lowpri && lowpri == pri && load > lowload) 1038 continue; 1039 lowpri = pri; 1040 lowcpu = cpu; 1041 lowload = load; 1042 } 1043 1044 return (lowcpu); 1045 } 1046 1047 /* 1048 * Find the thread queue with the least load. 
1049 */ 1050 static int 1051 tdq_lowestload(struct thread *td) 1052 { 1053 struct tdq *tdq; 1054 int lowload; 1055 int lowpri; 1056 int lowcpu; 1057 int load; 1058 int cpu; 1059 int pri; 1060 1061 lowcpu = 0; 1062 lowload = TDQ_CPU(0)->tdq_load; 1063 lowpri = TDQ_CPU(0)->tdq_lowpri; 1064 for (cpu = 1; cpu <= mp_maxid; cpu++) { 1065 if (CPU_ABSENT(cpu)) 1066 continue; 1067 if (!THREAD_CAN_SCHED(td, cpu)) 1068 continue; 1069 tdq = TDQ_CPU(cpu); 1070 load = tdq->tdq_load; 1071 pri = tdq->tdq_lowpri; 1072 CTR4(KTR_ULE, "cpu %d load %d lowcpu %d lowload %d", 1073 cpu, load, lowcpu, lowload); 1074 if (load > lowload) 1075 continue; 1076 if (load == lowload && pri < lowpri) 1077 continue; 1078 lowcpu = cpu; 1079 lowload = load; 1080 lowpri = pri; 1081 } 1082 1083 return (lowcpu); 1084 } 1085 1086 /* 1087 * Pick the destination cpu for sched_add(). Respects affinity and makes 1088 * a determination based on load or priority of available processors. 1089 */ 1090 static int 1091 sched_pickcpu(struct thread *td, int flags) 1092 { 1093 struct tdq *tdq; 1094 struct td_sched *ts; 1095 cpumask_t mask; 1096 int self; 1097 int pri; 1098 int cpu; 1099 1100 self = PCPU_GET(cpuid); 1101 ts = td->td_sched; 1102 if (smp_started == 0) 1103 return (self); 1104 /* 1105 * Don't migrate a running thread from sched_switch(). 1106 */ 1107 if (flags & SRQ_OURSELF) { 1108 CTR1(KTR_ULE, "YIELDING %d", 1109 curthread->td_priority); 1110 return (self); 1111 } 1112 pri = ts->ts_thread->td_priority; 1113 cpu = ts->ts_cpu; 1114 if (THREAD_CAN_SCHED(td, cpu)) { 1115 /* 1116 * Regardless of affinity, if the last cpu is idle 1117 * send it there. 1118 */ 1119 tdq = TDQ_CPU(cpu); 1120 if (tdq->tdq_lowpri > PRI_MIN_IDLE) { 1121 CTR5(KTR_ULE, 1122 "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d", 1123 ts->ts_cpu, ts->ts_rltick, ticks, pri, 1124 tdq->tdq_lowpri); 1125 return (ts->ts_cpu); 1126 } 1127 /* 1128 * If we have affinity, try to place it on the cpu we 1129 * last ran on. 
1130 */ 1131 if (SCHED_AFFINITY(ts) && tdq->tdq_lowpri > pri) { 1132 CTR5(KTR_ULE, 1133 "affinity for %d, ltick %d ticks %d pri %d curthread %d", 1134 ts->ts_cpu, ts->ts_rltick, ticks, pri, 1135 tdq->tdq_lowpri); 1136 return (ts->ts_cpu); 1137 } 1138 } 1139 1140 /* 1141 * Look for an idle group. 1142 */ 1143 CTR1(KTR_ULE, "tdq_idle %X", tdq_idle); 1144 mask = tdq_idle; 1145 while ((cpu = ffs(mask)) != 0) { 1146 --cpu; 1147 if (THREAD_CAN_SCHED(td, cpu)) 1148 return (cpu); 1149 mask &= ~(1 << cpu); 1150 } 1151 /* 1152 * If there are no idle cores see if we can run the thread locally. 1153 * This may improve locality among sleepers and wakers when there 1154 * is shared data. 1155 */ 1156 if (tryself && THREAD_CAN_SCHED(td, self) && 1157 pri < curthread->td_priority) { 1158 CTR1(KTR_ULE, "tryself %d", 1159 curthread->td_priority); 1160 return (self); 1161 } 1162 /* 1163 * Now search for the cpu running the lowest priority thread with 1164 * the least load. 1165 */ 1166 if (pick_pri) 1167 cpu = tdq_lowestpri(td); 1168 else 1169 cpu = tdq_lowestload(td); 1170 return (cpu); 1171 } 1172 1173 #endif /* SMP */ 1174 1175 /* 1176 * Pick the highest priority task we have and return it. 
1177 */ 1178 static struct td_sched * 1179 tdq_choose(struct tdq *tdq) 1180 { 1181 struct td_sched *ts; 1182 1183 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 1184 ts = runq_choose(&tdq->tdq_realtime); 1185 if (ts != NULL) 1186 return (ts); 1187 ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx); 1188 if (ts != NULL) { 1189 KASSERT(ts->ts_thread->td_priority >= PRI_MIN_BATCH, 1190 ("tdq_choose: Invalid priority on timeshare queue %d", 1191 ts->ts_thread->td_priority)); 1192 return (ts); 1193 } 1194 1195 ts = runq_choose(&tdq->tdq_idle); 1196 if (ts != NULL) { 1197 KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE, 1198 ("tdq_choose: Invalid priority on idle queue %d", 1199 ts->ts_thread->td_priority)); 1200 return (ts); 1201 } 1202 1203 return (NULL); 1204 } 1205 1206 /* 1207 * Initialize a thread queue. 1208 */ 1209 static void 1210 tdq_setup(struct tdq *tdq) 1211 { 1212 1213 if (bootverbose) 1214 printf("ULE: setup cpu %d\n", TDQ_ID(tdq)); 1215 runq_init(&tdq->tdq_realtime); 1216 runq_init(&tdq->tdq_timeshare); 1217 runq_init(&tdq->tdq_idle); 1218 tdq->tdq_load = 0; 1219 } 1220 1221 #ifdef SMP 1222 static void 1223 tdg_setup(struct tdq_group *tdg) 1224 { 1225 if (bootverbose) 1226 printf("ULE: setup cpu group %d\n", TDG_ID(tdg)); 1227 snprintf(tdg->tdg_name, sizeof(tdg->tdg_name), 1228 "sched lock %d", (int)TDG_ID(tdg)); 1229 mtx_init(&tdg->tdg_lock, tdg->tdg_name, "sched lock", 1230 MTX_SPIN | MTX_RECURSE); 1231 LIST_INIT(&tdg->tdg_members); 1232 tdg->tdg_load = 0; 1233 tdg->tdg_transferable = 0; 1234 tdg->tdg_cpus = 0; 1235 tdg->tdg_mask = 0; 1236 tdg->tdg_cpumask = 0; 1237 tdg->tdg_idlemask = 0; 1238 } 1239 1240 static void 1241 tdg_add(struct tdq_group *tdg, struct tdq *tdq) 1242 { 1243 if (tdg->tdg_mask == 0) 1244 tdg->tdg_mask |= 1 << TDQ_ID(tdq); 1245 tdg->tdg_cpumask |= 1 << TDQ_ID(tdq); 1246 tdg->tdg_cpus++; 1247 tdq->tdq_group = tdg; 1248 tdq->tdq_lock = &tdg->tdg_lock; 1249 LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings); 1250 if (bootverbose) 1251 
printf("ULE: adding cpu %d to group %d: cpus %d mask 0x%X\n", 1252 TDQ_ID(tdq), TDG_ID(tdg), tdg->tdg_cpus, tdg->tdg_cpumask); 1253 } 1254 1255 static void 1256 sched_setup_topology(void) 1257 { 1258 struct tdq_group *tdg; 1259 struct cpu_group *cg; 1260 int balance_groups; 1261 struct tdq *tdq; 1262 int i; 1263 int j; 1264 1265 topology = 1; 1266 balance_groups = 0; 1267 for (i = 0; i < smp_topology->ct_count; i++) { 1268 cg = &smp_topology->ct_group[i]; 1269 tdg = &tdq_groups[i]; 1270 /* 1271 * Initialize the group. 1272 */ 1273 tdg_setup(tdg); 1274 /* 1275 * Find all of the group members and add them. 1276 */ 1277 for (j = 0; j < MAXCPU; j++) { 1278 if ((cg->cg_mask & (1 << j)) != 0) { 1279 tdq = TDQ_CPU(j); 1280 tdq_setup(tdq); 1281 tdg_add(tdg, tdq); 1282 } 1283 } 1284 if (tdg->tdg_cpus > 1) 1285 balance_groups = 1; 1286 } 1287 tdg_maxid = smp_topology->ct_count - 1; 1288 if (balance_groups) 1289 sched_balance_groups(); 1290 } 1291 1292 static void 1293 sched_setup_smp(void) 1294 { 1295 struct tdq_group *tdg; 1296 struct tdq *tdq; 1297 int cpus; 1298 int i; 1299 1300 for (cpus = 0, i = 0; i < MAXCPU; i++) { 1301 if (CPU_ABSENT(i)) 1302 continue; 1303 tdq = &tdq_cpu[i]; 1304 tdg = &tdq_groups[i]; 1305 /* 1306 * Setup a tdq group with one member. 1307 */ 1308 tdg_setup(tdg); 1309 tdq_setup(tdq); 1310 tdg_add(tdg, tdq); 1311 cpus++; 1312 } 1313 tdg_maxid = cpus - 1; 1314 } 1315 1316 /* 1317 * Fake a topology with one group containing all CPUs. 1318 */ 1319 static void 1320 sched_fake_topo(void) 1321 { 1322 #ifdef SCHED_FAKE_TOPOLOGY 1323 static struct cpu_top top; 1324 static struct cpu_group group; 1325 1326 top.ct_count = 1; 1327 top.ct_group = &group; 1328 group.cg_mask = all_cpus; 1329 group.cg_count = mp_ncpus; 1330 group.cg_children = 0; 1331 smp_topology = &top; 1332 #endif 1333 } 1334 #endif 1335 1336 /* 1337 * Setup the thread queues and initialize the topology based on MD 1338 * information. 
1339 */ 1340 static void 1341 sched_setup(void *dummy) 1342 { 1343 struct tdq *tdq; 1344 1345 tdq = TDQ_SELF(); 1346 #ifdef SMP 1347 sched_fake_topo(); 1348 /* 1349 * Setup tdqs based on a topology configuration or vanilla SMP based 1350 * on mp_maxid. 1351 */ 1352 if (smp_topology == NULL) 1353 sched_setup_smp(); 1354 else 1355 sched_setup_topology(); 1356 balance_tdq = tdq; 1357 sched_balance(); 1358 #else 1359 tdq_setup(tdq); 1360 mtx_init(&tdq_lock, "sched lock", "sched lock", MTX_SPIN | MTX_RECURSE); 1361 tdq->tdq_lock = &tdq_lock; 1362 #endif 1363 /* 1364 * To avoid divide-by-zero, we set realstathz a dummy value 1365 * in case which sched_clock() called before sched_initticks(). 1366 */ 1367 realstathz = hz; 1368 sched_slice = (realstathz/10); /* ~100ms */ 1369 tickincr = 1 << SCHED_TICK_SHIFT; 1370 1371 /* Add thread0's load since it's running. */ 1372 TDQ_LOCK(tdq); 1373 thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF()); 1374 tdq_load_add(tdq, &td_sched0); 1375 TDQ_UNLOCK(tdq); 1376 } 1377 1378 /* 1379 * This routine determines the tickincr after stathz and hz are setup. 1380 */ 1381 /* ARGSUSED */ 1382 static void 1383 sched_initticks(void *dummy) 1384 { 1385 int incr; 1386 1387 realstathz = stathz ? stathz : hz; 1388 sched_slice = (realstathz/10); /* ~100ms */ 1389 1390 /* 1391 * tickincr is shifted out by 10 to avoid rounding errors due to 1392 * hz not being evenly divisible by stathz on all platforms. 1393 */ 1394 incr = (hz << SCHED_TICK_SHIFT) / realstathz; 1395 /* 1396 * This does not work for values of stathz that are more than 1397 * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. 1398 */ 1399 if (incr == 0) 1400 incr = 1; 1401 tickincr = incr; 1402 #ifdef SMP 1403 /* 1404 * Set the default balance interval now that we know 1405 * what realstathz is. 1406 */ 1407 balance_interval = realstathz; 1408 /* 1409 * Set steal thresh to roughly log2(mp_ncpu) but no greater than 4. 
1410 * This prevents excess thrashing on large machines and excess idle 1411 * on smaller machines. 1412 */ 1413 steal_thresh = min(fls(mp_ncpus) - 1, 3); 1414 affinity = SCHED_AFFINITY_DEFAULT; 1415 #endif 1416 } 1417 1418 1419 /* 1420 * This is the core of the interactivity algorithm. Determines a score based 1421 * on past behavior. It is the ratio of sleep time to run time scaled to 1422 * a [0, 100] integer. This is the voluntary sleep time of a process, which 1423 * differs from the cpu usage because it does not account for time spent 1424 * waiting on a run-queue. Would be prettier if we had floating point. 1425 */ 1426 static int 1427 sched_interact_score(struct thread *td) 1428 { 1429 struct td_sched *ts; 1430 int div; 1431 1432 ts = td->td_sched; 1433 /* 1434 * The score is only needed if this is likely to be an interactive 1435 * task. Don't go through the expense of computing it if there's 1436 * no chance. 1437 */ 1438 if (sched_interact <= SCHED_INTERACT_HALF && 1439 ts->ts_runtime >= ts->ts_slptime) 1440 return (SCHED_INTERACT_HALF); 1441 1442 if (ts->ts_runtime > ts->ts_slptime) { 1443 div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF); 1444 return (SCHED_INTERACT_HALF + 1445 (SCHED_INTERACT_HALF - (ts->ts_slptime / div))); 1446 } 1447 if (ts->ts_slptime > ts->ts_runtime) { 1448 div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF); 1449 return (ts->ts_runtime / div); 1450 } 1451 /* runtime == slptime */ 1452 if (ts->ts_runtime) 1453 return (SCHED_INTERACT_HALF); 1454 1455 /* 1456 * This can happen if slptime and runtime are 0. 1457 */ 1458 return (0); 1459 1460 } 1461 1462 /* 1463 * Scale the scheduling priority according to the "interactivity" of this 1464 * process. 
1465 */ 1466 static void 1467 sched_priority(struct thread *td) 1468 { 1469 int score; 1470 int pri; 1471 1472 if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) 1473 return; 1474 /* 1475 * If the score is interactive we place the thread in the realtime 1476 * queue with a priority that is less than kernel and interrupt 1477 * priorities. These threads are not subject to nice restrictions. 1478 * 1479 * Scores greater than this are placed on the normal timeshare queue 1480 * where the priority is partially decided by the most recent cpu 1481 * utilization and the rest is decided by nice value. 1482 * 1483 * The nice value of the process has a linear effect on the calculated 1484 * score. Negative nice values make it easier for a thread to be 1485 * considered interactive. 1486 */ 1487 score = imax(0, sched_interact_score(td) + td->td_proc->p_nice); 1488 if (score < sched_interact) { 1489 pri = PRI_MIN_INTERACT; 1490 pri += ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) / 1491 sched_interact) * score; 1492 KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT, 1493 ("sched_priority: invalid interactive priority %d score %d", 1494 pri, score)); 1495 } else { 1496 pri = SCHED_PRI_MIN; 1497 if (td->td_sched->ts_ticks) 1498 pri += SCHED_PRI_TICKS(td->td_sched); 1499 pri += SCHED_PRI_NICE(td->td_proc->p_nice); 1500 KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH, 1501 ("sched_priority: invalid priority %d: nice %d, " 1502 "ticks %d ftick %d ltick %d tick pri %d", 1503 pri, td->td_proc->p_nice, td->td_sched->ts_ticks, 1504 td->td_sched->ts_ftick, td->td_sched->ts_ltick, 1505 SCHED_PRI_TICKS(td->td_sched))); 1506 } 1507 sched_user_prio(td, pri); 1508 1509 return; 1510 } 1511 1512 /* 1513 * This routine enforces a maximum limit on the amount of scheduling history 1514 * kept. It is called after either the slptime or runtime is adjusted. This 1515 * function is ugly due to integer math. 
1516 */ 1517 static void 1518 sched_interact_update(struct thread *td) 1519 { 1520 struct td_sched *ts; 1521 u_int sum; 1522 1523 ts = td->td_sched; 1524 sum = ts->ts_runtime + ts->ts_slptime; 1525 if (sum < SCHED_SLP_RUN_MAX) 1526 return; 1527 /* 1528 * This only happens from two places: 1529 * 1) We have added an unusual amount of run time from fork_exit. 1530 * 2) We have added an unusual amount of sleep time from sched_sleep(). 1531 */ 1532 if (sum > SCHED_SLP_RUN_MAX * 2) { 1533 if (ts->ts_runtime > ts->ts_slptime) { 1534 ts->ts_runtime = SCHED_SLP_RUN_MAX; 1535 ts->ts_slptime = 1; 1536 } else { 1537 ts->ts_slptime = SCHED_SLP_RUN_MAX; 1538 ts->ts_runtime = 1; 1539 } 1540 return; 1541 } 1542 /* 1543 * If we have exceeded by more than 1/5th then the algorithm below 1544 * will not bring us back into range. Dividing by two here forces 1545 * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] 1546 */ 1547 if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { 1548 ts->ts_runtime /= 2; 1549 ts->ts_slptime /= 2; 1550 return; 1551 } 1552 ts->ts_runtime = (ts->ts_runtime / 5) * 4; 1553 ts->ts_slptime = (ts->ts_slptime / 5) * 4; 1554 } 1555 1556 /* 1557 * Scale back the interactivity history when a child thread is created. The 1558 * history is inherited from the parent but the thread may behave totally 1559 * differently. For example, a shell spawning a compiler process. We want 1560 * to learn that the compiler is behaving badly very quickly. 1561 */ 1562 static void 1563 sched_interact_fork(struct thread *td) 1564 { 1565 int ratio; 1566 int sum; 1567 1568 sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime; 1569 if (sum > SCHED_SLP_RUN_FORK) { 1570 ratio = sum / SCHED_SLP_RUN_FORK; 1571 td->td_sched->ts_runtime /= ratio; 1572 td->td_sched->ts_slptime /= ratio; 1573 } 1574 } 1575 1576 /* 1577 * Called from proc0_init() to setup the scheduler fields. 
1578 */ 1579 void 1580 schedinit(void) 1581 { 1582 1583 /* 1584 * Set up the scheduler specific parts of proc0. 1585 */ 1586 proc0.p_sched = NULL; /* XXX */ 1587 thread0.td_sched = &td_sched0; 1588 td_sched0.ts_ltick = ticks; 1589 td_sched0.ts_ftick = ticks; 1590 td_sched0.ts_thread = &thread0; 1591 } 1592 1593 /* 1594 * This is only somewhat accurate since given many processes of the same 1595 * priority they will switch when their slices run out, which will be 1596 * at most sched_slice stathz ticks. 1597 */ 1598 int 1599 sched_rr_interval(void) 1600 { 1601 1602 /* Convert sched_slice to hz */ 1603 return (hz/(realstathz/sched_slice)); 1604 } 1605 1606 /* 1607 * Update the percent cpu tracking information when it is requested or 1608 * the total history exceeds the maximum. We keep a sliding history of 1609 * tick counts that slowly decays. This is less precise than the 4BSD 1610 * mechanism since it happens with less regular and frequent events. 1611 */ 1612 static void 1613 sched_pctcpu_update(struct td_sched *ts) 1614 { 1615 1616 if (ts->ts_ticks == 0) 1617 return; 1618 if (ticks - (hz / 10) < ts->ts_ltick && 1619 SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX) 1620 return; 1621 /* 1622 * Adjust counters and watermark for pctcpu calc. 1623 */ 1624 if (ts->ts_ltick > ticks - SCHED_TICK_TARG) 1625 ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) * 1626 SCHED_TICK_TARG; 1627 else 1628 ts->ts_ticks = 0; 1629 ts->ts_ltick = ticks; 1630 ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG; 1631 } 1632 1633 /* 1634 * Adjust the priority of a thread. Move it to the appropriate run-queue 1635 * if necessary. This is the back-end for several priority related 1636 * functions. 
1637 */ 1638 static void 1639 sched_thread_priority(struct thread *td, u_char prio) 1640 { 1641 struct td_sched *ts; 1642 1643 CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)", 1644 td, td->td_proc->p_comm, td->td_priority, prio, curthread, 1645 curthread->td_proc->p_comm); 1646 ts = td->td_sched; 1647 THREAD_LOCK_ASSERT(td, MA_OWNED); 1648 if (td->td_priority == prio) 1649 return; 1650 1651 if (TD_ON_RUNQ(td) && prio < td->td_priority) { 1652 /* 1653 * If the priority has been elevated due to priority 1654 * propagation, we may have to move ourselves to a new 1655 * queue. This could be optimized to not re-add in some 1656 * cases. 1657 */ 1658 sched_rem(td); 1659 td->td_priority = prio; 1660 sched_add(td, SRQ_BORROWING); 1661 } else { 1662 #ifdef SMP 1663 struct tdq *tdq; 1664 1665 tdq = TDQ_CPU(ts->ts_cpu); 1666 if (prio < tdq->tdq_lowpri) 1667 tdq->tdq_lowpri = prio; 1668 #endif 1669 td->td_priority = prio; 1670 } 1671 } 1672 1673 /* 1674 * Update a thread's priority when it is lent another thread's 1675 * priority. 1676 */ 1677 void 1678 sched_lend_prio(struct thread *td, u_char prio) 1679 { 1680 1681 td->td_flags |= TDF_BORROWING; 1682 sched_thread_priority(td, prio); 1683 } 1684 1685 /* 1686 * Restore a thread's priority when priority propagation is 1687 * over. The prio argument is the minimum priority the thread 1688 * needs to have to satisfy other possible priority lending 1689 * requests. If the thread's regular priority is less 1690 * important than prio, the thread will keep a priority boost 1691 * of prio. 
1692 */ 1693 void 1694 sched_unlend_prio(struct thread *td, u_char prio) 1695 { 1696 u_char base_pri; 1697 1698 if (td->td_base_pri >= PRI_MIN_TIMESHARE && 1699 td->td_base_pri <= PRI_MAX_TIMESHARE) 1700 base_pri = td->td_user_pri; 1701 else 1702 base_pri = td->td_base_pri; 1703 if (prio >= base_pri) { 1704 td->td_flags &= ~TDF_BORROWING; 1705 sched_thread_priority(td, base_pri); 1706 } else 1707 sched_lend_prio(td, prio); 1708 } 1709 1710 /* 1711 * Standard entry for setting the priority to an absolute value. 1712 */ 1713 void 1714 sched_prio(struct thread *td, u_char prio) 1715 { 1716 u_char oldprio; 1717 1718 /* First, update the base priority. */ 1719 td->td_base_pri = prio; 1720 1721 /* 1722 * If the thread is borrowing another thread's priority, don't 1723 * ever lower the priority. 1724 */ 1725 if (td->td_flags & TDF_BORROWING && td->td_priority < prio) 1726 return; 1727 1728 /* Change the real priority. */ 1729 oldprio = td->td_priority; 1730 sched_thread_priority(td, prio); 1731 1732 /* 1733 * If the thread is on a turnstile, then let the turnstile update 1734 * its state. 1735 */ 1736 if (TD_ON_LOCK(td) && oldprio != prio) 1737 turnstile_adjust(td, oldprio); 1738 } 1739 1740 /* 1741 * Set the base user priority, does not effect current running priority. 
1742 */ 1743 void 1744 sched_user_prio(struct thread *td, u_char prio) 1745 { 1746 u_char oldprio; 1747 1748 td->td_base_user_pri = prio; 1749 if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio) 1750 return; 1751 oldprio = td->td_user_pri; 1752 td->td_user_pri = prio; 1753 } 1754 1755 void 1756 sched_lend_user_prio(struct thread *td, u_char prio) 1757 { 1758 u_char oldprio; 1759 1760 THREAD_LOCK_ASSERT(td, MA_OWNED); 1761 td->td_flags |= TDF_UBORROWING; 1762 oldprio = td->td_user_pri; 1763 td->td_user_pri = prio; 1764 } 1765 1766 void 1767 sched_unlend_user_prio(struct thread *td, u_char prio) 1768 { 1769 u_char base_pri; 1770 1771 THREAD_LOCK_ASSERT(td, MA_OWNED); 1772 base_pri = td->td_base_user_pri; 1773 if (prio >= base_pri) { 1774 td->td_flags &= ~TDF_UBORROWING; 1775 sched_user_prio(td, base_pri); 1776 } else { 1777 sched_lend_user_prio(td, prio); 1778 } 1779 } 1780 1781 /* 1782 * Add the thread passed as 'newtd' to the run queue before selecting 1783 * the next thread to run. This is only used for KSE. 1784 */ 1785 static void 1786 sched_switchin(struct tdq *tdq, struct thread *td) 1787 { 1788 #ifdef SMP 1789 spinlock_enter(); 1790 TDQ_UNLOCK(tdq); 1791 thread_lock(td); 1792 spinlock_exit(); 1793 sched_setcpu(td->td_sched, TDQ_ID(tdq), SRQ_YIELDING); 1794 #else 1795 td->td_lock = TDQ_LOCKPTR(tdq); 1796 #endif 1797 tdq_add(tdq, td, SRQ_YIELDING); 1798 MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); 1799 } 1800 1801 /* 1802 * Handle migration from sched_switch(). This happens only for 1803 * cpu binding. 1804 */ 1805 static struct mtx * 1806 sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags) 1807 { 1808 struct tdq *tdn; 1809 1810 tdn = TDQ_CPU(td->td_sched->ts_cpu); 1811 #ifdef SMP 1812 /* 1813 * Do the lock dance required to avoid LOR. We grab an extra 1814 * spinlock nesting to prevent preemption while we're 1815 * not holding either run-queue lock. 
1816 */ 1817 spinlock_enter(); 1818 thread_lock_block(td); /* This releases the lock on tdq. */ 1819 1820 /* 1821 * Acquire both run-queue locks before placing the thread on the new 1822 * run-queue to avoid deadlocks created by placing a thread with a 1823 * blocked lock on the run-queue of a remote processor. The deadlock 1824 * occurs when a third processor attempts to lock the two queues in 1825 * question while the target processor is spinning with its own 1826 * run-queue lock held while waiting for the blocked lock to clear. 1827 */ 1828 if (TDQ_LOCKPTR(tdn) == TDQ_LOCKPTR(tdq)) { 1829 TDQ_LOCK(tdq); 1830 tdq_add(tdn, td, flags); 1831 tdq_notify(td->td_sched); 1832 } else { 1833 tdq_lock_pair(tdn, tdq); 1834 tdq_add(tdn, td, flags); 1835 tdq_notify(td->td_sched); 1836 TDQ_UNLOCK(tdn); 1837 } 1838 spinlock_exit(); 1839 #endif 1840 return (TDQ_LOCKPTR(tdn)); 1841 } 1842 1843 /* 1844 * Variadic version of thread_lock_unblock() that does not assume td_lock 1845 * is blocked. 1846 */ 1847 static inline void 1848 thread_unblock_switch(struct thread *td, struct mtx *mtx) 1849 { 1850 atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock, 1851 (uintptr_t)mtx); 1852 } 1853 1854 /* 1855 * Switch threads. This function has to handle threads coming in while 1856 * blocked for some reason, running, or idle. It also must deal with 1857 * migrating a thread from one queue to another as running threads may 1858 * be assigned elsewhere via binding. 
1859 */ 1860 void 1861 sched_switch(struct thread *td, struct thread *newtd, int flags) 1862 { 1863 struct tdq *tdq; 1864 struct td_sched *ts; 1865 struct mtx *mtx; 1866 int srqflag; 1867 int cpuid; 1868 1869 THREAD_LOCK_ASSERT(td, MA_OWNED); 1870 1871 cpuid = PCPU_GET(cpuid); 1872 tdq = TDQ_CPU(cpuid); 1873 ts = td->td_sched; 1874 mtx = td->td_lock; 1875 #ifdef SMP 1876 ts->ts_rltick = ticks; 1877 if (newtd && newtd->td_priority < tdq->tdq_lowpri) 1878 tdq->tdq_lowpri = newtd->td_priority; 1879 #endif 1880 td->td_lastcpu = td->td_oncpu; 1881 td->td_oncpu = NOCPU; 1882 if (!(flags & SW_PREEMPT)) 1883 td->td_flags &= ~TDF_NEEDRESCHED; 1884 td->td_owepreempt = 0; 1885 /* 1886 * The lock pointer in an idle thread should never change. Reset it 1887 * to CAN_RUN as well. 1888 */ 1889 if (TD_IS_IDLETHREAD(td)) { 1890 MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); 1891 TD_SET_CAN_RUN(td); 1892 } else if (TD_IS_RUNNING(td)) { 1893 MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); 1894 tdq_load_rem(tdq, ts); 1895 srqflag = (flags & SW_PREEMPT) ? 1896 SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : 1897 SRQ_OURSELF|SRQ_YIELDING; 1898 #ifdef SMP 1899 if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu)) 1900 ts->ts_cpu = sched_pickcpu(td, 0); 1901 #endif 1902 if (ts->ts_cpu == cpuid) 1903 tdq_add(tdq, td, srqflag); 1904 else { 1905 KASSERT(THREAD_CAN_MIGRATE(td) || 1906 (ts->ts_flags & TSF_BOUND) != 0, 1907 ("Thread %p shouldn't migrate", td)); 1908 mtx = sched_switch_migrate(tdq, td, srqflag); 1909 } 1910 } else { 1911 /* This thread must be going to sleep. */ 1912 TDQ_LOCK(tdq); 1913 mtx = thread_lock_block(td); 1914 tdq_load_rem(tdq, ts); 1915 } 1916 /* 1917 * We enter here with the thread blocked and assigned to the 1918 * appropriate cpu run-queue or sleep-queue and with the current 1919 * thread-queue locked. 1920 */ 1921 TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); 1922 /* 1923 * If KSE assigned a new thread just add it here and let choosethread 1924 * select the best one. 
1925 */ 1926 if (newtd != NULL) 1927 sched_switchin(tdq, newtd); 1928 newtd = choosethread(); 1929 /* 1930 * Call the MD code to switch contexts if necessary. 1931 */ 1932 if (td != newtd) { 1933 #ifdef HWPMC_HOOKS 1934 if (PMC_PROC_IS_USING_PMCS(td->td_proc)) 1935 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); 1936 #endif 1937 TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd; 1938 1939 #ifdef KDTRACE_HOOKS 1940 /* 1941 * If DTrace has set the active vtime enum to anything 1942 * other than INACTIVE (0), then it should have set the 1943 * function to call. 1944 */ 1945 if (dtrace_vtime_active) 1946 (*dtrace_vtime_switch_func)(newtd); 1947 #endif 1948 cpu_switch(td, newtd, mtx); 1949 /* 1950 * We may return from cpu_switch on a different cpu. However, 1951 * we always return with td_lock pointing to the current cpu's 1952 * run queue lock. 1953 */ 1954 cpuid = PCPU_GET(cpuid); 1955 tdq = TDQ_CPU(cpuid); 1956 #ifdef HWPMC_HOOKS 1957 if (PMC_PROC_IS_USING_PMCS(td->td_proc)) 1958 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); 1959 #endif 1960 } else 1961 thread_unblock_switch(td, mtx); 1962 /* 1963 * Assert that all went well and return. 1964 */ 1965 #ifdef SMP 1966 /* We should always get here with the lowest priority td possible */ 1967 tdq->tdq_lowpri = td->td_priority; 1968 #endif 1969 TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED); 1970 MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); 1971 td->td_oncpu = cpuid; 1972 } 1973 1974 /* 1975 * Adjust thread priorities as a result of a nice request. 1976 */ 1977 void 1978 sched_nice(struct proc *p, int nice) 1979 { 1980 struct thread *td; 1981 1982 PROC_LOCK_ASSERT(p, MA_OWNED); 1983 PROC_SLOCK_ASSERT(p, MA_OWNED); 1984 1985 p->p_nice = nice; 1986 FOREACH_THREAD_IN_PROC(p, td) { 1987 thread_lock(td); 1988 sched_priority(td); 1989 sched_prio(td, td->td_base_user_pri); 1990 thread_unlock(td); 1991 } 1992 } 1993 1994 /* 1995 * Record the sleep time for the interactivity scorer. 
1996 */ 1997 void 1998 sched_sleep(struct thread *td) 1999 { 2000 2001 THREAD_LOCK_ASSERT(td, MA_OWNED); 2002 2003 td->td_slptick = ticks; 2004 } 2005 2006 /* 2007 * Schedule a thread to resume execution and record how long it voluntarily 2008 * slept. We also update the pctcpu, interactivity, and priority. 2009 */ 2010 void 2011 sched_wakeup(struct thread *td) 2012 { 2013 struct td_sched *ts; 2014 int slptick; 2015 2016 THREAD_LOCK_ASSERT(td, MA_OWNED); 2017 ts = td->td_sched; 2018 /* 2019 * If we slept for more than a tick update our interactivity and 2020 * priority. 2021 */ 2022 slptick = td->td_slptick; 2023 td->td_slptick = 0; 2024 if (slptick && slptick != ticks) { 2025 u_int hzticks; 2026 2027 hzticks = (ticks - slptick) << SCHED_TICK_SHIFT; 2028 ts->ts_slptime += hzticks; 2029 sched_interact_update(td); 2030 sched_pctcpu_update(ts); 2031 sched_priority(td); 2032 } 2033 /* Reset the slice value after we sleep. */ 2034 ts->ts_slice = sched_slice; 2035 sched_add(td, SRQ_BORING); 2036 } 2037 2038 /* 2039 * Penalize the parent for creating a new child and initialize the child's 2040 * priority. 2041 */ 2042 void 2043 sched_fork(struct thread *td, struct thread *child) 2044 { 2045 THREAD_LOCK_ASSERT(td, MA_OWNED); 2046 sched_fork_thread(td, child); 2047 /* 2048 * Penalize the parent and child for forking. 2049 */ 2050 sched_interact_fork(child); 2051 sched_priority(child); 2052 td->td_sched->ts_runtime += tickincr; 2053 sched_interact_update(td); 2054 sched_priority(td); 2055 } 2056 2057 /* 2058 * Fork a new thread, may be within the same process. 2059 */ 2060 void 2061 sched_fork_thread(struct thread *td, struct thread *child) 2062 { 2063 struct td_sched *ts; 2064 struct td_sched *ts2; 2065 2066 /* 2067 * Initialize child. 
2068 */ 2069 THREAD_LOCK_ASSERT(td, MA_OWNED); 2070 sched_newthread(child); 2071 child->td_lock = TDQ_LOCKPTR(TDQ_SELF()); 2072 child->td_cpuset = cpuset_ref(td->td_cpuset); 2073 ts = td->td_sched; 2074 ts2 = child->td_sched; 2075 ts2->ts_cpu = ts->ts_cpu; 2076 ts2->ts_runq = NULL; 2077 /* 2078 * Grab our parents cpu estimation information. 2079 */ 2080 ts2->ts_ticks = ts->ts_ticks; 2081 ts2->ts_ltick = ts->ts_ltick; 2082 ts2->ts_incrtick = ts->ts_incrtick; 2083 ts2->ts_ftick = ts->ts_ftick; 2084 /* 2085 * Do not inherit any borrowed priority from the parent. 2086 */ 2087 child->td_priority = child->td_base_pri; 2088 /* 2089 * And update interactivity score. 2090 */ 2091 ts2->ts_slptime = ts->ts_slptime; 2092 ts2->ts_runtime = ts->ts_runtime; 2093 ts2->ts_slice = 1; /* Attempt to quickly learn interactivity. */ 2094 } 2095 2096 /* 2097 * Adjust the priority class of a thread. 2098 */ 2099 void 2100 sched_class(struct thread *td, int class) 2101 { 2102 2103 THREAD_LOCK_ASSERT(td, MA_OWNED); 2104 if (td->td_pri_class == class) 2105 return; 2106 2107 #ifdef SMP 2108 /* 2109 * On SMP if we're on the RUNQ we must adjust the transferable 2110 * count because could be changing to or from an interrupt 2111 * class. 2112 */ 2113 if (TD_ON_RUNQ(td)) { 2114 struct tdq *tdq; 2115 2116 tdq = TDQ_CPU(td->td_sched->ts_cpu); 2117 if (THREAD_CAN_MIGRATE(td)) { 2118 tdq->tdq_transferable--; 2119 tdq->tdq_group->tdg_transferable--; 2120 } 2121 td->td_pri_class = class; 2122 if (THREAD_CAN_MIGRATE(td)) { 2123 tdq->tdq_transferable++; 2124 tdq->tdq_group->tdg_transferable++; 2125 } 2126 } 2127 #endif 2128 td->td_pri_class = class; 2129 } 2130 2131 /* 2132 * Return some of the child's priority and interactivity to the parent. 
2133 */ 2134 void 2135 sched_exit(struct proc *p, struct thread *child) 2136 { 2137 struct thread *td; 2138 2139 CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d", 2140 child, child->td_proc->p_comm, child->td_priority); 2141 2142 PROC_SLOCK_ASSERT(p, MA_OWNED); 2143 td = FIRST_THREAD_IN_PROC(p); 2144 sched_exit_thread(td, child); 2145 } 2146 2147 /* 2148 * Penalize another thread for the time spent on this one. This helps to 2149 * worsen the priority and interactivity of processes which schedule batch 2150 * jobs such as make. This has little effect on the make process itself but 2151 * causes new processes spawned by it to receive worse scores immediately. 2152 */ 2153 void 2154 sched_exit_thread(struct thread *td, struct thread *child) 2155 { 2156 2157 CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", 2158 child, child->td_proc->p_comm, child->td_priority); 2159 2160 #ifdef KSE 2161 /* 2162 * KSE forks and exits so often that this penalty causes short-lived 2163 * threads to always be non-interactive. This causes mozilla to 2164 * crawl under load. 2165 */ 2166 if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc) 2167 return; 2168 #endif 2169 /* 2170 * Give the child's runtime to the parent without returning the 2171 * sleep time as a penalty to the parent. This causes shells that 2172 * launch expensive things to mark their children as expensive. 2173 */ 2174 thread_lock(td); 2175 td->td_sched->ts_runtime += child->td_sched->ts_runtime; 2176 sched_interact_update(td); 2177 sched_priority(td); 2178 thread_unlock(td); 2179 } 2180 2181 /* 2182 * Fix priorities on return to user-space. Priorities may be elevated due 2183 * to static priorities in msleep() or similar. 2184 */ 2185 void 2186 sched_userret(struct thread *td) 2187 { 2188 /* 2189 * XXX we cheat slightly on the locking here to avoid locking in 2190 * the usual case. Setting td_priority here is essentially an 2191 * incomplete workaround for not setting it properly elsewhere. 
2192 * Now that some interrupt handlers are threads, not setting it 2193 * properly elsewhere can clobber it in the window between setting 2194 * it here and returning to user mode, so don't waste time setting 2195 * it perfectly here. 2196 */ 2197 KASSERT((td->td_flags & TDF_BORROWING) == 0, 2198 ("thread with borrowed priority returning to userland")); 2199 if (td->td_priority != td->td_user_pri) { 2200 thread_lock(td); 2201 td->td_priority = td->td_user_pri; 2202 td->td_base_pri = td->td_user_pri; 2203 thread_unlock(td); 2204 } 2205 } 2206 2207 /* 2208 * Handle a stathz tick. This is really only relevant for timeshare 2209 * threads. 2210 */ 2211 void 2212 sched_clock(struct thread *td) 2213 { 2214 struct tdq *tdq; 2215 struct td_sched *ts; 2216 2217 THREAD_LOCK_ASSERT(td, MA_OWNED); 2218 tdq = TDQ_SELF(); 2219 #ifdef SMP 2220 /* 2221 * We run the long term load balancer infrequently on the first cpu. 2222 */ 2223 if (balance_tdq == tdq) { 2224 if (balance_ticks && --balance_ticks == 0) 2225 sched_balance(); 2226 if (balance_group_ticks && --balance_group_ticks == 0) 2227 sched_balance_groups(); 2228 } 2229 #endif 2230 /* 2231 * Advance the insert index once for each tick to ensure that all 2232 * threads get a chance to run. 2233 */ 2234 if (tdq->tdq_idx == tdq->tdq_ridx) { 2235 tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS; 2236 if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx])) 2237 tdq->tdq_ridx = tdq->tdq_idx; 2238 } 2239 ts = td->td_sched; 2240 if (td->td_pri_class & PRI_FIFO_BIT) 2241 return; 2242 if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) { 2243 /* 2244 * We used a tick; charge it to the thread so 2245 * that we can compute our interactivity. 2246 */ 2247 td->td_sched->ts_runtime += tickincr; 2248 sched_interact_update(td); 2249 } 2250 /* 2251 * We used up one time slice. 2252 */ 2253 if (--ts->ts_slice > 0) 2254 return; 2255 /* 2256 * We're out of time, recompute priorities and requeue. 
2257 */ 2258 sched_priority(td); 2259 td->td_flags |= TDF_NEEDRESCHED; 2260 } 2261 2262 /* 2263 * Called once per hz tick. Used for cpu utilization information. This 2264 * is easier than trying to scale based on stathz. 2265 */ 2266 void 2267 sched_tick(void) 2268 { 2269 struct td_sched *ts; 2270 2271 ts = curthread->td_sched; 2272 /* 2273 * Ticks is updated asynchronously on a single cpu. Check here to 2274 * avoid incrementing ts_ticks multiple times in a single tick. 2275 */ 2276 if (ts->ts_incrtick == ticks) 2277 return; 2278 /* Adjust ticks for pctcpu */ 2279 ts->ts_ticks += 1 << SCHED_TICK_SHIFT; 2280 ts->ts_incrtick = ticks; 2281 ts->ts_ltick = ticks; 2282 /* 2283 * Update if we've exceeded our desired tick threshhold by over one 2284 * second. 2285 */ 2286 if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick) 2287 sched_pctcpu_update(ts); 2288 } 2289 2290 /* 2291 * Return whether the current CPU has runnable tasks. Used for in-kernel 2292 * cooperative idle threads. 2293 */ 2294 int 2295 sched_runnable(void) 2296 { 2297 struct tdq *tdq; 2298 int load; 2299 2300 load = 1; 2301 2302 tdq = TDQ_SELF(); 2303 if ((curthread->td_flags & TDF_IDLETD) != 0) { 2304 if (tdq->tdq_load > 0) 2305 goto out; 2306 } else 2307 if (tdq->tdq_load - 1 > 0) 2308 goto out; 2309 load = 0; 2310 out: 2311 return (load); 2312 } 2313 2314 /* 2315 * Choose the highest priority thread to run. The thread is removed from 2316 * the run-queue while running however the load remains. For SMP we set 2317 * the tdq in the global idle bitmask if it idles here. 2318 */ 2319 struct thread * 2320 sched_choose(void) 2321 { 2322 #ifdef SMP 2323 struct tdq_group *tdg; 2324 #endif 2325 struct td_sched *ts; 2326 struct tdq *tdq; 2327 2328 tdq = TDQ_SELF(); 2329 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 2330 ts = tdq_choose(tdq); 2331 if (ts) { 2332 tdq_runq_rem(tdq, ts); 2333 return (ts->ts_thread); 2334 } 2335 #ifdef SMP 2336 /* 2337 * We only set the idled bit when all of the cpus in the group are 2338 * idle. 
Otherwise we could get into a situation where a thread bounces 2339 * back and forth between two idle cores on seperate physical CPUs. 2340 */ 2341 tdg = tdq->tdq_group; 2342 tdg->tdg_idlemask |= PCPU_GET(cpumask); 2343 if (tdg->tdg_idlemask == tdg->tdg_cpumask) 2344 atomic_set_int(&tdq_idle, tdg->tdg_mask); 2345 tdq->tdq_lowpri = PRI_MAX_IDLE; 2346 #endif 2347 return (PCPU_GET(idlethread)); 2348 } 2349 2350 /* 2351 * Set owepreempt if necessary. Preemption never happens directly in ULE, 2352 * we always request it once we exit a critical section. 2353 */ 2354 static inline void 2355 sched_setpreempt(struct thread *td) 2356 { 2357 struct thread *ctd; 2358 int cpri; 2359 int pri; 2360 2361 ctd = curthread; 2362 pri = td->td_priority; 2363 cpri = ctd->td_priority; 2364 if (td->td_priority < ctd->td_priority) 2365 curthread->td_flags |= TDF_NEEDRESCHED; 2366 if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) 2367 return; 2368 /* 2369 * Always preempt IDLE threads. Otherwise only if the preempting 2370 * thread is an ithread. 2371 */ 2372 if (pri > preempt_thresh && cpri < PRI_MIN_IDLE) 2373 return; 2374 ctd->td_owepreempt = 1; 2375 return; 2376 } 2377 2378 /* 2379 * Add a thread to a thread queue. Initializes priority, slice, runq, and 2380 * add it to the appropriate queue. This is the internal function called 2381 * when the tdq is predetermined. 
2382 */ 2383 void 2384 tdq_add(struct tdq *tdq, struct thread *td, int flags) 2385 { 2386 struct td_sched *ts; 2387 int class; 2388 #ifdef SMP 2389 int cpumask; 2390 #endif 2391 2392 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 2393 KASSERT((td->td_inhibitors == 0), 2394 ("sched_add: trying to run inhibited thread")); 2395 KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), 2396 ("sched_add: bad thread state")); 2397 KASSERT(td->td_flags & TDF_INMEM, 2398 ("sched_add: thread swapped out")); 2399 2400 ts = td->td_sched; 2401 class = PRI_BASE(td->td_pri_class); 2402 TD_SET_RUNQ(td); 2403 if (ts->ts_slice == 0) 2404 ts->ts_slice = sched_slice; 2405 /* 2406 * Pick the run queue based on priority. 2407 */ 2408 if (td->td_priority < PRI_MIN_BATCH) 2409 ts->ts_runq = &tdq->tdq_realtime; 2410 else if (td->td_priority <= PRI_MAX_BATCH) 2411 ts->ts_runq = &tdq->tdq_timeshare; 2412 else 2413 ts->ts_runq = &tdq->tdq_idle; 2414 #ifdef SMP 2415 cpumask = 1 << ts->ts_cpu; 2416 /* 2417 * If we had been idle, clear our bit in the group and potentially 2418 * the global bitmap. 2419 */ 2420 if ((class != PRI_IDLE && class != PRI_ITHD) && 2421 (tdq->tdq_group->tdg_idlemask & cpumask) != 0) { 2422 /* 2423 * Check to see if our group is unidling, and if so, remove it 2424 * from the global idle mask. 2425 */ 2426 if (tdq->tdq_group->tdg_idlemask == 2427 tdq->tdq_group->tdg_cpumask) 2428 atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask); 2429 /* 2430 * Now remove ourselves from the group specific idle mask. 2431 */ 2432 tdq->tdq_group->tdg_idlemask &= ~cpumask; 2433 } 2434 if (td->td_priority < tdq->tdq_lowpri) 2435 tdq->tdq_lowpri = td->td_priority; 2436 #endif 2437 tdq_runq_add(tdq, ts, flags); 2438 tdq_load_add(tdq, ts); 2439 } 2440 2441 /* 2442 * Select the target thread queue and add a thread to it. Request 2443 * preemption or IPI a remote processor if required. 
2444 */ 2445 void 2446 sched_add(struct thread *td, int flags) 2447 { 2448 struct td_sched *ts; 2449 struct tdq *tdq; 2450 #ifdef SMP 2451 int cpuid; 2452 int cpu; 2453 #endif 2454 CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", 2455 td, td->td_proc->p_comm, td->td_priority, curthread, 2456 curthread->td_proc->p_comm); 2457 THREAD_LOCK_ASSERT(td, MA_OWNED); 2458 ts = td->td_sched; 2459 /* 2460 * Recalculate the priority before we select the target cpu or 2461 * run-queue. 2462 */ 2463 if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) 2464 sched_priority(td); 2465 #ifdef SMP 2466 cpuid = PCPU_GET(cpuid); 2467 /* 2468 * Pick the destination cpu and if it isn't ours transfer to the 2469 * target cpu. 2470 */ 2471 if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_MIGRATE(td) && 2472 curthread->td_intr_nesting_level) 2473 ts->ts_cpu = cpuid; 2474 if (!THREAD_CAN_MIGRATE(td)) 2475 cpu = ts->ts_cpu; 2476 else 2477 cpu = sched_pickcpu(td, flags); 2478 tdq = sched_setcpu(ts, cpu, flags); 2479 tdq_add(tdq, td, flags); 2480 if (cpu != cpuid) { 2481 tdq_notify(ts); 2482 return; 2483 } 2484 #else 2485 tdq = TDQ_SELF(); 2486 TDQ_LOCK(tdq); 2487 /* 2488 * Now that the thread is moving to the run-queue, set the lock 2489 * to the scheduler's lock. 2490 */ 2491 thread_lock_set(td, TDQ_LOCKPTR(tdq)); 2492 tdq_add(tdq, td, flags); 2493 #endif 2494 if (!(flags & SRQ_YIELDING)) 2495 sched_setpreempt(td); 2496 } 2497 2498 /* 2499 * Remove a thread from a run-queue without running it. This is used 2500 * when we're stealing a thread from a remote queue. Otherwise all threads 2501 * exit by calling sched_exit_thread() and sched_throw() themselves. 
2502 */ 2503 void 2504 sched_rem(struct thread *td) 2505 { 2506 struct tdq *tdq; 2507 struct td_sched *ts; 2508 2509 CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)", 2510 td, td->td_proc->p_comm, td->td_priority, curthread, 2511 curthread->td_proc->p_comm); 2512 ts = td->td_sched; 2513 tdq = TDQ_CPU(ts->ts_cpu); 2514 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 2515 MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); 2516 KASSERT(TD_ON_RUNQ(td), 2517 ("sched_rem: thread not on run queue")); 2518 tdq_runq_rem(tdq, ts); 2519 tdq_load_rem(tdq, ts); 2520 TD_SET_CAN_RUN(td); 2521 } 2522 2523 /* 2524 * Fetch cpu utilization information. Updates on demand. 2525 */ 2526 fixpt_t 2527 sched_pctcpu(struct thread *td) 2528 { 2529 fixpt_t pctcpu; 2530 struct td_sched *ts; 2531 2532 pctcpu = 0; 2533 ts = td->td_sched; 2534 if (ts == NULL) 2535 return (0); 2536 2537 THREAD_LOCK_ASSERT(td, MA_OWNED); 2538 if (ts->ts_ticks) { 2539 int rtick; 2540 2541 sched_pctcpu_update(ts); 2542 /* How many rtick per second ? */ 2543 rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz); 2544 pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; 2545 } 2546 2547 return (pctcpu); 2548 } 2549 2550 /* 2551 * Enforce affinity settings for a thread. Called after adjustments to 2552 * cpumask. 2553 */ 2554 void 2555 sched_affinity(struct thread *td) 2556 { 2557 #ifdef SMP 2558 struct td_sched *ts; 2559 2560 THREAD_LOCK_ASSERT(td, MA_OWNED); 2561 ts = td->td_sched; 2562 if (THREAD_CAN_SCHED(td, ts->ts_cpu)) 2563 return; 2564 if (TD_ON_RUNQ(td)) { 2565 sched_rem(td); 2566 sched_add(td, SRQ_BORING); 2567 return; 2568 } 2569 if (!TD_IS_RUNNING(td)) 2570 return; 2571 /* 2572 * Force a switch before returning to userspace. If the 2573 * target thread is not running locally send an ipi to force 2574 * the issue. 2575 */ 2576 td->td_flags |= TDF_NEEDRESCHED; 2577 if (td != curthread) 2578 ipi_selected(1 << ts->ts_cpu, IPI_PREEMPT); 2579 #endif 2580 } 2581 2582 /* 2583 * Bind a thread to a target cpu. 
2584 */ 2585 void 2586 sched_bind(struct thread *td, int cpu) 2587 { 2588 struct td_sched *ts; 2589 2590 THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); 2591 KASSERT(td == curthread, ("sched_bind: can only bind curthread")); 2592 ts = td->td_sched; 2593 if (ts->ts_flags & TSF_BOUND) 2594 sched_unbind(td); 2595 KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td)); 2596 ts->ts_flags |= TSF_BOUND; 2597 #ifdef SMP 2598 sched_pin(); 2599 if (PCPU_GET(cpuid) == cpu) 2600 return; 2601 ts->ts_cpu = cpu; 2602 /* When we return from mi_switch we'll be on the correct cpu. */ 2603 mi_switch(SW_VOL, NULL); 2604 #endif 2605 } 2606 2607 /* 2608 * Release a bound thread. 2609 */ 2610 void 2611 sched_unbind(struct thread *td) 2612 { 2613 struct td_sched *ts; 2614 2615 THREAD_LOCK_ASSERT(td, MA_OWNED); 2616 KASSERT(td == curthread, ("sched_unbind: can only bind curthread")); 2617 ts = td->td_sched; 2618 if ((ts->ts_flags & TSF_BOUND) == 0) 2619 return; 2620 ts->ts_flags &= ~TSF_BOUND; 2621 #ifdef SMP 2622 sched_unpin(); 2623 #endif 2624 } 2625 2626 int 2627 sched_is_bound(struct thread *td) 2628 { 2629 THREAD_LOCK_ASSERT(td, MA_OWNED); 2630 return (td->td_sched->ts_flags & TSF_BOUND); 2631 } 2632 2633 /* 2634 * Basic yield call. 2635 */ 2636 void 2637 sched_relinquish(struct thread *td) 2638 { 2639 thread_lock(td); 2640 SCHED_STAT_INC(switch_relinquish); 2641 mi_switch(SW_VOL, NULL); 2642 thread_unlock(td); 2643 } 2644 2645 /* 2646 * Return the total system load. 
2647 */ 2648 int 2649 sched_load(void) 2650 { 2651 #ifdef SMP 2652 int total; 2653 int i; 2654 2655 total = 0; 2656 for (i = 0; i <= tdg_maxid; i++) 2657 total += TDQ_GROUP(i)->tdg_load; 2658 return (total); 2659 #else 2660 return (TDQ_SELF()->tdq_sysload); 2661 #endif 2662 } 2663 2664 int 2665 sched_sizeof_proc(void) 2666 { 2667 return (sizeof(struct proc)); 2668 } 2669 2670 int 2671 sched_sizeof_thread(void) 2672 { 2673 return (sizeof(struct thread) + sizeof(struct td_sched)); 2674 } 2675 2676 /* 2677 * The actual idle process. 2678 */ 2679 void 2680 sched_idletd(void *dummy) 2681 { 2682 struct thread *td; 2683 struct tdq *tdq; 2684 2685 td = curthread; 2686 tdq = TDQ_SELF(); 2687 mtx_assert(&Giant, MA_NOTOWNED); 2688 /* ULE relies on preemption for idle interruption. */ 2689 for (;;) { 2690 #ifdef SMP 2691 if (tdq_idled(tdq)) 2692 cpu_idle(); 2693 #else 2694 cpu_idle(); 2695 #endif 2696 } 2697 } 2698 2699 /* 2700 * A CPU is entering for the first time or a thread is exiting. 2701 */ 2702 void 2703 sched_throw(struct thread *td) 2704 { 2705 struct thread *newtd; 2706 struct tdq *tdq; 2707 2708 tdq = TDQ_SELF(); 2709 if (td == NULL) { 2710 /* Correct spinlock nesting and acquire the correct lock. */ 2711 TDQ_LOCK(tdq); 2712 spinlock_exit(); 2713 } else { 2714 MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); 2715 tdq_load_rem(tdq, td->td_sched); 2716 } 2717 KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); 2718 newtd = choosethread(); 2719 TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd; 2720 PCPU_SET(switchtime, cpu_ticks()); 2721 PCPU_SET(switchticks, ticks); 2722 cpu_throw(td, newtd); /* doesn't return */ 2723 } 2724 2725 /* 2726 * This is called from fork_exit(). Just acquire the correct locks and 2727 * let fork do the rest of the work. 
2728 */ 2729 void 2730 sched_fork_exit(struct thread *td) 2731 { 2732 struct td_sched *ts; 2733 struct tdq *tdq; 2734 int cpuid; 2735 2736 /* 2737 * Finish setting up thread glue so that it begins execution in a 2738 * non-nested critical section with the scheduler lock held. 2739 */ 2740 cpuid = PCPU_GET(cpuid); 2741 tdq = TDQ_CPU(cpuid); 2742 ts = td->td_sched; 2743 if (TD_IS_IDLETHREAD(td)) 2744 td->td_lock = TDQ_LOCKPTR(tdq); 2745 MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); 2746 td->td_oncpu = cpuid; 2747 TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); 2748 } 2749 2750 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, 2751 "Scheduler"); 2752 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0, 2753 "Scheduler name"); 2754 SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, 2755 "Slice size for timeshare threads"); 2756 SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, 2757 "Interactivity score threshold"); 2758 SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh, 2759 0,"Min priority for preemption, lower priorities have greater precedence"); 2760 #ifdef SMP 2761 SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0, 2762 "Pick the target cpu based on priority rather than load."); 2763 SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0, 2764 "Number of hz ticks to keep thread affinity for"); 2765 SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, ""); 2766 SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, 2767 "Enables the long-term load balancer"); 2768 SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, 2769 &balance_interval, 0, 2770 "Average frequency in stathz ticks to run the long-term balancer"); 2771 SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, 2772 "Steals work from another hyper-threaded core on idle"); 2773 SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, 
CTLFLAG_RW, &steal_idle, 0, 2774 "Attempts to steal work from other cores before idling"); 2775 SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0, 2776 "Minimum load on remote cpu before we'll steal"); 2777 SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0, 2778 "True when a topology has been specified by the MD code."); 2779 #endif 2780 2781 /* ps compat. All cpu percentages from ULE are weighted. */ 2782 static int ccpu = 0; 2783 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 2784 2785 2786 #define KERN_SWITCH_INCLUDE 1 2787 #include "kern/kern_switch.c"

Cache object: c11c993a2643f81c7f91da92cd99510c

FreeBSD/Linux Kernel Cross Reference sys/kern/sched_ule.c

FreeBSD/Linux Kernel Cross Reference
sys/kern/sched_ule.c