FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_runq.c
1 /* $NetBSD: kern_runq.c,v 1.69 2020/05/23 21:24:41 ad Exp $ */
2
3 /*-
4 * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
34 * All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 *
45 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE.
56 */
57
58 #include <sys/cdefs.h>
59 __KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.69 2020/05/23 21:24:41 ad Exp $");
60
61 #include "opt_dtrace.h"
62
63 #include <sys/param.h>
64 #include <sys/kernel.h>
65 #include <sys/bitops.h>
66 #include <sys/cpu.h>
67 #include <sys/idle.h>
68 #include <sys/intr.h>
69 #include <sys/kmem.h>
70 #include <sys/lwp.h>
71 #include <sys/mutex.h>
72 #include <sys/proc.h>
73 #include <sys/pset.h>
74 #include <sys/sched.h>
75 #include <sys/syscallargs.h>
76 #include <sys/sysctl.h>
77 #include <sys/systm.h>
78 #include <sys/types.h>
79 #include <sys/evcnt.h>
80 #include <sys/atomic.h>
81
82 /*
83 * Bits per map.
84 */
85 #define BITMAP_BITS (32)
86 #define BITMAP_SHIFT (5)
87 #define BITMAP_MSB (0x80000000U)
88 #define BITMAP_MASK (BITMAP_BITS - 1)
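
/*
 * Illustrative sketch (not from the original source): a priority
 * "eprio" maps into the bitmap as a word index plus an MSB-first bit
 * within that word:
 *
 *	i = eprio >> BITMAP_SHIFT;                  word index
 *	q = BITMAP_MSB >> (eprio & BITMAP_MASK);    bit within the word
 *
 * For example, eprio == 37 selects word 1 (37 >> 5) and bit 0x04000000
 * (0x80000000 >> 5).  Because higher priorities land in lower bit
 * positions, ffs() on a word finds the highest queued priority, which
 * is recovered as (i << BITMAP_SHIFT) + (BITMAP_BITS - ffs(word));
 * see sched_dequeue().
 */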
89
90 const int schedppq = 1;
91
92 static void *sched_getrq(struct schedstate_percpu *, const pri_t);
93 #ifdef MULTIPROCESSOR
94 static lwp_t * sched_catchlwp(struct cpu_info *);
95 #endif
96
97 /*
98 * Preemption control.
99 */
100 #ifdef __HAVE_PREEMPTION
101 # ifdef DEBUG
102 int sched_kpreempt_pri = 0;
103 # else
104 int sched_kpreempt_pri = PRI_USER_RT;
105 # endif
106 #else
107 int sched_kpreempt_pri = 1000;
108 #endif
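
/*
 * Note (an added observation, not from the original source): without
 * __HAVE_PREEMPTION the value 1000 is above any valid thread priority,
 * so the kernel preemption path is never taken; with DEBUG it is 0,
 * so kernel preemption is exercised as aggressively as possible.
 */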
109
110 /*
111 * Migration and balancing.
112 */
113 static u_int cacheht_time; /* Cache hotness time */
114 static u_int min_catch; /* Minimal LWP count for catching */
115 static u_int skim_interval; /* Rate limit for stealing LWPs */
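
/*
 * All three are run-time tunable as kern.sched.cacheht_time,
 * kern.sched.min_catch and kern.sched.skim_interval; see the
 * SYSCTL_SETUP at the bottom of this file.
 */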
116
117 #ifdef KDTRACE_HOOKS
118 struct lwp *curthread;
119 #endif
120
121 void
122 runq_init(void)
123 {
124
125 	/* To be pulled from a remote package, an LWP must not have run for 10ms. */
126 cacheht_time = 10;
127
128 	/* Minimum count of LWPs for catching */
129 min_catch = 1;
130
131 /* Steal from other CPUs at most every 10ms. */
132 skim_interval = 10;
133 }
134
135 void
136 sched_cpuattach(struct cpu_info *ci)
137 {
138 struct schedstate_percpu *spc;
139 size_t size;
140 void *p;
141 u_int i;
142
143 spc = &ci->ci_schedstate;
144 spc->spc_nextpkg = ci;
145
146 if (spc->spc_lwplock == NULL) {
147 spc->spc_lwplock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
148 }
149 if (ci == lwp0.l_cpu) {
150 /* Initialize the scheduler structure of the primary LWP */
151 lwp0.l_mutex = spc->spc_lwplock;
152 }
153 if (spc->spc_mutex != NULL) {
154 /* Already initialized. */
155 return;
156 }
157
158 /* Allocate the run queue */
159 size = roundup2(sizeof(spc->spc_queue[0]) * PRI_COUNT, coherency_unit) +
160 coherency_unit;
161 p = kmem_alloc(size, KM_SLEEP);
162 spc->spc_queue = (void *)roundup2((uintptr_t)p, coherency_unit);
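	/*
	 * Note: the size is padded by an extra coherency_unit so that the
	 * aligned pointer above always stays inside the allocation; the
	 * cache line alignment avoids false sharing between CPUs.
	 */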
163
164 /* Initialize run queues */
165 spc->spc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
166 for (i = 0; i < PRI_COUNT; i++)
167 TAILQ_INIT(&spc->spc_queue[i]);
168 }
169
170 /*
171 * Control of the runqueue.
172 */
173 static inline void *
174 sched_getrq(struct schedstate_percpu *spc, const pri_t prio)
175 {
176
177 KASSERT(prio < PRI_COUNT);
178 return &spc->spc_queue[prio];
179 }
180
181 /*
182 * Put an LWP onto a run queue. The LWP must be locked by spc_mutex for
183 * l_cpu.
184 */
185 void
186 sched_enqueue(struct lwp *l)
187 {
188 struct schedstate_percpu *spc;
189 TAILQ_HEAD(, lwp) *q_head;
190 const pri_t eprio = lwp_eprio(l);
191 struct cpu_info *ci;
192
193 ci = l->l_cpu;
194 spc = &ci->ci_schedstate;
195 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
196
197 /* Enqueue the thread */
198 q_head = sched_getrq(spc, eprio);
199 if (TAILQ_EMPTY(q_head)) {
200 u_int i;
201 uint32_t q;
202
203 /* Mark bit */
204 i = eprio >> BITMAP_SHIFT;
205 q = BITMAP_MSB >> (eprio & BITMAP_MASK);
206 KASSERT((spc->spc_bitmap[i] & q) == 0);
207 spc->spc_bitmap[i] |= q;
208 }
209
210 /*
211 * Determine run queue position according to POSIX. XXX Explicitly
212 * lowering a thread's priority with pthread_setschedparam() is not
213 * handled.
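	 *
	 * In short: a preempted SCHED_FIFO thread goes back to the head of
	 * its queue; a preempted SCHED_RR thread goes to the tail if its
	 * time quantum has expired and to the head otherwise; SCHED_OTHER
	 * threads, and threads that were not preempted, go to the tail.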
214 */
215 if ((l->l_pflag & LP_PREEMPTING) != 0) {
216 switch (l->l_class) {
217 case SCHED_OTHER:
218 TAILQ_INSERT_TAIL(q_head, l, l_runq);
219 break;
220 case SCHED_FIFO:
221 TAILQ_INSERT_HEAD(q_head, l, l_runq);
222 break;
223 case SCHED_RR:
224 if (getticks() - l->l_rticks >= sched_rrticks) {
225 TAILQ_INSERT_TAIL(q_head, l, l_runq);
226 } else {
227 TAILQ_INSERT_HEAD(q_head, l, l_runq);
228 }
229 break;
230 		default:
231 panic("sched_enqueue: LWP %p has class %d\n",
232 l, l->l_class);
233 }
234 } else {
235 TAILQ_INSERT_TAIL(q_head, l, l_runq);
236 }
237 spc->spc_flags &= ~SPCF_IDLE;
238 spc->spc_count++;
239 if ((l->l_pflag & LP_BOUND) == 0) {
240 atomic_store_relaxed(&spc->spc_mcount,
241 atomic_load_relaxed(&spc->spc_mcount) + 1);
242 }
243
244 /*
245 	 * Update the cached highest priority of the runqueue if the
246 	 * priority of this thread is higher.
247 */
248 if (eprio > spc->spc_maxpriority)
249 spc->spc_maxpriority = eprio;
250
251 sched_newts(l);
252 }
253
254 /*
255  * Remove an LWP from the run queue it's on. The LWP must be in state
256 * LSRUN.
257 */
258 void
259 sched_dequeue(struct lwp *l)
260 {
261 TAILQ_HEAD(, lwp) *q_head;
262 struct schedstate_percpu *spc;
263 const pri_t eprio = lwp_eprio(l);
264
265 spc = &l->l_cpu->ci_schedstate;
266
267 KASSERT(lwp_locked(l, spc->spc_mutex));
268 KASSERT(eprio <= spc->spc_maxpriority);
269 KASSERT(spc->spc_bitmap[eprio >> BITMAP_SHIFT] != 0);
270 KASSERT(spc->spc_count > 0);
271
272 if (spc->spc_migrating == l)
273 spc->spc_migrating = NULL;
274
275 spc->spc_count--;
276 if ((l->l_pflag & LP_BOUND) == 0) {
277 atomic_store_relaxed(&spc->spc_mcount,
278 atomic_load_relaxed(&spc->spc_mcount) - 1);
279 }
280
281 q_head = sched_getrq(spc, eprio);
282 TAILQ_REMOVE(q_head, l, l_runq);
283 if (TAILQ_EMPTY(q_head)) {
284 u_int i;
285 uint32_t q;
286
287 /* Unmark bit */
288 i = eprio >> BITMAP_SHIFT;
289 q = BITMAP_MSB >> (eprio & BITMAP_MASK);
290 KASSERT((spc->spc_bitmap[i] & q) != 0);
291 spc->spc_bitmap[i] &= ~q;
292
293 /*
294 		 * Update the cached highest priority of the runqueue, in
295 		 * case this was the last thread in the highest priority queue.
296 */
297 if (eprio != spc->spc_maxpriority)
298 return;
299
300 do {
301 if (spc->spc_bitmap[i] != 0) {
302 q = ffs(spc->spc_bitmap[i]);
303 spc->spc_maxpriority =
304 (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
305 return;
306 }
307 } while (i--);
308
309 /* If not found - set the lowest value */
310 spc->spc_maxpriority = 0;
311 }
312 }
313
314 /*
315  * Cause a preemption on the given CPU, if the priority "pri" is higher
316  * than that of the running LWP. If "unlock" is specified, and ideally it
317 * will be for concurrency reasons, spc_mutex will be dropped before return.
318 */
319 void
320 sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
321 {
322 struct schedstate_percpu *spc;
323 u_int o, n, f;
324 lwp_t *l;
325
326 spc = &ci->ci_schedstate;
327
328 KASSERT(mutex_owned(spc->spc_mutex));
329
330 /*
331 * If the priority level we're evaluating wouldn't cause a new LWP
332 * to be run on the CPU, then we have nothing to do.
333 */
334 if (pri <= spc->spc_curpriority || !mp_online) {
335 if (__predict_true(unlock)) {
336 spc_unlock(ci);
337 }
338 return;
339 }
340
341 /*
342 * Figure out what kind of preemption we should do.
343 */
344 l = ci->ci_onproc;
345 if ((l->l_flag & LW_IDLE) != 0) {
346 f = RESCHED_IDLE | RESCHED_UPREEMPT;
347 } else if (pri >= sched_kpreempt_pri && (l->l_pflag & LP_INTR) == 0) {
348 /* We can't currently preempt softints - should be able to. */
349 #ifdef __HAVE_PREEMPTION
350 f = RESCHED_KPREEMPT;
351 #else
352 /* Leave door open for test: set kpreempt_pri with sysctl. */
353 f = RESCHED_UPREEMPT;
354 #endif
355 /*
356 * l_dopreempt must be set with the CPU locked to sync with
357 * mi_switch(). It must also be set with an atomic to sync
358 * with kpreempt().
359 */
360 atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE);
361 } else {
362 f = RESCHED_UPREEMPT;
363 }
364 if (ci != curcpu()) {
365 f |= RESCHED_REMOTE;
366 }
367
368 /*
369 * Things can start as soon as ci_want_resched is touched: x86 has
370 * an instruction that monitors the memory cell it's in. Drop the
371 * schedstate lock in advance, otherwise the remote CPU can awaken
372 * and immediately block on the lock.
373 */
374 if (__predict_true(unlock)) {
375 spc_unlock(ci);
376 }
377
378 /*
379 * The caller almost always has a second scheduler lock held: either
380 * the running LWP lock (spc_lwplock), or a sleep queue lock. That
381 * keeps preemption disabled, which among other things ensures all
382 * LWPs involved won't be freed while we're here (see lwp_dtor()).
383 */
384 KASSERT(kpreempt_disabled());
385
386 for (o = 0;; o = n) {
387 n = atomic_cas_uint(&ci->ci_want_resched, o, o | f);
388 if (__predict_true(o == n)) {
389 /*
390 * We're the first to set a resched on the CPU. Try
391 * to avoid causing a needless trip through trap()
392 * to handle an AST fault, if it's known the LWP
393 * will either block or go through userret() soon.
394 */
395 if (l != curlwp || cpu_intr_p()) {
396 cpu_need_resched(ci, l, f);
397 }
398 break;
399 }
400 if (__predict_true(
401 (n & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)) >=
402 (f & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)))) {
403 /* Already in progress, nothing to do. */
404 break;
405 }
406 }
407 }
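
/*
 * Illustrative sketch (not from the original source) of the lock-free
 * flag-accumulation idiom used in the loop above, in isolation:
 *
 *	unsigned o = 0, n;
 *	for (;; o = n) {
 *		n = atomic_cas_uint(&word, o, o | f);
 *		if (n == o)
 *			break;		our bits were installed
 *	}
 *
 * A caller whose bits add something new performs the expensive
 * notification (cpu_need_resched()); a caller whose request is already
 * covered by equal or stronger bits breaks out without further work.
 */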
408
409 /*
410  * Cause a preemption on the given CPU, if the priority of LWP "l", which
411  * must be in state LSRUN, is higher than that of the running LWP. If "unlock" is
412 * specified, and ideally it will be for concurrency reasons, spc_mutex will
413 * be dropped before return.
414 */
415 void
416 sched_resched_lwp(struct lwp *l, bool unlock)
417 {
418 struct cpu_info *ci = l->l_cpu;
419
420 KASSERT(lwp_locked(l, ci->ci_schedstate.spc_mutex));
421 KASSERT(l->l_stat == LSRUN);
422
423 sched_resched_cpu(ci, lwp_eprio(l), unlock);
424 }
425
426 /*
427 * Migration and balancing.
428 */
429
430 #ifdef MULTIPROCESSOR
431
432 /*
433 * Estimate if LWP is cache-hot.
434 */
435 static inline bool
436 lwp_cache_hot(const struct lwp *l)
437 {
438
439 /* Leave new LWPs in peace, determination has already been made. */
440 if (l->l_stat == LSIDL)
441 return true;
442
443 if (__predict_false(l->l_slptime != 0 || l->l_rticks == 0))
444 return false;
445
446 return (getticks() - l->l_rticks < mstohz(cacheht_time));
447 }
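
/*
 * Worked example (assuming HZ = 100): with the default cacheht_time of
 * 10ms, mstohz(10) is one tick, so an LWP is considered cache-hot only
 * if it began its last run during the current tick.
 */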
448
449 /*
450 * Check if LWP can migrate to the chosen CPU.
451 */
452 static inline bool
453 sched_migratable(const struct lwp *l, struct cpu_info *ci)
454 {
455 const struct schedstate_percpu *spc = &ci->ci_schedstate;
456 KASSERT(lwp_locked(__UNCONST(l), NULL));
457
458 /* Is CPU offline? */
459 if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
460 return false;
461
462 /* Is affinity set? */
463 if (__predict_false(l->l_affinity))
464 return kcpuset_isset(l->l_affinity, cpu_index(ci));
465
466 /* Is there a processor-set? */
467 return (spc->spc_psid == l->l_psid);
468 }
469
470 /*
471 * A small helper to do round robin through CPU packages.
472 */
473 static struct cpu_info *
474 sched_nextpkg(void)
475 {
476 struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
477
478 spc->spc_nextpkg =
479 spc->spc_nextpkg->ci_sibling[CPUREL_PACKAGE1ST];
480
481 return spc->spc_nextpkg;
482 }
483
484 /*
485 * Find a CPU to run LWP "l". Look for the CPU with the lowest priority
486 * thread. In case of equal priority, prefer first class CPUs, and amongst
487 * the remainder choose the CPU with the fewest runqueue entries.
488 *
489 * Begin the search in the CPU package which "pivot" is a member of.
490 */
491 static struct cpu_info * __noinline
492 sched_bestcpu(struct lwp *l, struct cpu_info *pivot)
493 {
494 struct cpu_info *bestci, *curci, *outer;
495 struct schedstate_percpu *bestspc, *curspc;
496 pri_t bestpri, curpri;
497
498 /*
499 * If this fails (it shouldn't), run on the given CPU. This also
500 * gives us a weak preference for "pivot" to begin with.
501 */
502 bestci = pivot;
503 bestspc = &bestci->ci_schedstate;
504 if (sched_migratable(l, bestci)) {
505 bestpri = MAX(bestspc->spc_curpriority,
506 bestspc->spc_maxpriority);
507 } else {
508 /* Invalidate the priority. */
509 bestpri = PRI_COUNT;
510 }
511
512 /* In the outer loop scroll through all CPU packages. */
513 pivot = pivot->ci_package1st;
514 outer = pivot;
515 do {
516 /* In the inner loop scroll through all CPUs in package. */
517 curci = outer;
518 do {
519 if (!sched_migratable(l, curci)) {
520 continue;
521 }
522
523 curspc = &curci->ci_schedstate;
524
525 /* If this CPU is idle and 1st class, we're done. */
526 if ((curspc->spc_flags & (SPCF_IDLE | SPCF_1STCLASS)) ==
527 (SPCF_IDLE | SPCF_1STCLASS)) {
528 return curci;
529 }
530
531 curpri = MAX(curspc->spc_curpriority,
532 curspc->spc_maxpriority);
533
534 if (curpri > bestpri) {
535 continue;
536 }
537 if (curpri == bestpri) {
538 /* Prefer first class CPUs over others. */
539 if ((curspc->spc_flags & SPCF_1STCLASS) == 0 &&
540 (bestspc->spc_flags & SPCF_1STCLASS) != 0) {
541 continue;
542 }
543 /*
544 * Pick the least busy CPU. Make sure this is not
545 * <=, otherwise it defeats the above preference.
546 */
547 if (bestspc->spc_count < curspc->spc_count) {
548 continue;
549 }
550 }
551
552 bestpri = curpri;
553 bestci = curci;
554 bestspc = curspc;
555
556 } while (curci = curci->ci_sibling[CPUREL_PACKAGE],
557 curci != outer);
558 } while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
559 outer != pivot);
560
561 return bestci;
562 }
563
564 /*
565  * Estimate whether the LWP should migrate to another CPU.
566  * Choose and return a CPU; it may be the current one if no migration is needed.
567 */
568 struct cpu_info *
569 sched_takecpu(struct lwp *l)
570 {
571 struct schedstate_percpu *spc, *tspc;
572 struct cpu_info *ci, *curci, *tci;
573 pri_t eprio;
574 int flags;
575
576 KASSERT(lwp_locked(l, NULL));
577
578 	/* If the thread is strictly bound, do not consider other CPUs */
579 ci = l->l_cpu;
580 if (l->l_pflag & LP_BOUND)
581 return ci;
582
583 spc = &ci->ci_schedstate;
584 eprio = lwp_eprio(l);
585
586 /*
587 * Handle new LWPs. For vfork() with a timeshared child, make it
588 * run on the same CPU as the parent if no other LWPs in queue.
589 * Otherwise scatter far and wide - try for an even distribution
590 * across all CPU packages and CPUs.
591 */
592 if (l->l_stat == LSIDL) {
593 if (curlwp->l_vforkwaiting && l->l_class == SCHED_OTHER) {
594 if (sched_migratable(l, curlwp->l_cpu) && eprio >
595 curlwp->l_cpu->ci_schedstate.spc_maxpriority) {
596 return curlwp->l_cpu;
597 }
598 } else {
599 return sched_bestcpu(l, sched_nextpkg());
600 }
601 flags = SPCF_IDLE;
602 } else {
603 flags = SPCF_IDLE | SPCF_1STCLASS;
604 }
605
606 /*
607 * Try to send the LWP back to the first CPU in the same core if
608 * idle. This keeps LWPs clustered in the run queues of 1st class
609 * CPUs. This implies stickiness. If we didn't find a home for
610 * a vfork() child above, try to use any SMT sibling to help out.
611 */
612 tci = ci;
613 do {
614 tspc = &tci->ci_schedstate;
615 if ((tspc->spc_flags & flags) == flags &&
616 sched_migratable(l, tci)) {
617 return tci;
618 }
619 tci = tci->ci_sibling[CPUREL_CORE];
620 } while (tci != ci);
621
622 /*
623 * Otherwise the LWP is "sticky", i.e. generally preferring to stay
624 * on the same CPU.
625 */
626 if (sched_migratable(l, ci) && (eprio > spc->spc_curpriority ||
627 (lwp_cache_hot(l) && l->l_class == SCHED_OTHER))) {
628 return ci;
629 }
630
631 /*
632 * If the current CPU core is idle, run there and avoid the
633 * expensive scan of CPUs below.
634 */
635 curci = curcpu();
636 tci = curci;
637 do {
638 tspc = &tci->ci_schedstate;
639 if ((tspc->spc_flags & flags) == flags &&
640 sched_migratable(l, tci)) {
641 return tci;
642 }
643 tci = tci->ci_sibling[CPUREL_CORE];
644 } while (tci != curci);
645
646 /*
647 * Didn't find a new home above - happens infrequently. Start the
648 	 * search in the last CPU package the LWP ran in, but expand to
649 * include the whole system if needed.
650 */
651 return sched_bestcpu(l, l->l_cpu);
652 }
653
654 /*
655  * Try to catch (steal) an LWP from the run queue of another CPU.
656 */
657 static struct lwp *
658 sched_catchlwp(struct cpu_info *ci)
659 {
660 struct cpu_info *curci = curcpu();
661 struct schedstate_percpu *spc, *curspc;
662 TAILQ_HEAD(, lwp) *q_head;
663 struct lwp *l;
664 bool gentle;
665
666 curspc = &curci->ci_schedstate;
667 spc = &ci->ci_schedstate;
668
669 /*
670 * Be more aggressive if this CPU is first class, and the other
671 * is not.
672 */
673 gentle = ((curspc->spc_flags & SPCF_1STCLASS) == 0 ||
674 (spc->spc_flags & SPCF_1STCLASS) != 0);
675
676 if (atomic_load_relaxed(&spc->spc_mcount) < (gentle ? min_catch : 1) ||
677 curspc->spc_psid != spc->spc_psid) {
678 spc_unlock(ci);
679 return NULL;
680 }
681
682 /* Take the highest priority thread */
683 q_head = sched_getrq(spc, spc->spc_maxpriority);
684 l = TAILQ_FIRST(q_head);
685
686 for (;;) {
687 /* Check the first and next result from the queue */
688 if (l == NULL) {
689 break;
690 }
691 KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
692 ci->ci_data.cpu_name,
693 l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);
694
695 		/* Look only for threads that are allowed to migrate */
696 if ((l->l_pflag & LP_BOUND) ||
697 (gentle && lwp_cache_hot(l)) ||
698 !sched_migratable(l, curci)) {
699 l = TAILQ_NEXT(l, l_runq);
700 /* XXX Gap: could walk down priority list. */
701 continue;
702 }
703
704 /* Grab the thread, and move to the local run queue */
705 sched_dequeue(l);
706 l->l_cpu = curci;
707 lwp_unlock_to(l, curspc->spc_mutex);
708 sched_enqueue(l);
709 return l;
710 }
711 spc_unlock(ci);
712
713 return l;
714 }
715
716 /*
717 * Called from sched_idle() to handle migration. Return the CPU that we
718 * pushed the LWP to (may be NULL).
719 */
720 static struct cpu_info *
721 sched_idle_migrate(void)
722 {
723 struct cpu_info *ci = curcpu(), *tci = NULL;
724 struct schedstate_percpu *spc, *tspc;
725 bool dlock = false;
726
727 spc = &ci->ci_schedstate;
728 spc_lock(ci);
729 for (;;) {
730 struct lwp *l;
731
732 l = spc->spc_migrating;
733 if (l == NULL)
734 break;
735
736 /*
737 * If second attempt, and target CPU has changed,
738 * drop the old lock.
739 */
740 if (dlock == true && tci != l->l_target_cpu) {
741 KASSERT(tci != NULL);
742 spc_unlock(tci);
743 dlock = false;
744 }
745
746 /*
747 * Nothing to do if destination has changed to the
748 		 * local CPU, or migration was done by another CPU.
749 */
750 tci = l->l_target_cpu;
751 if (tci == NULL || tci == ci) {
752 spc->spc_migrating = NULL;
753 l->l_target_cpu = NULL;
754 break;
755 }
756 tspc = &tci->ci_schedstate;
757
758 /*
759 * Double-lock the runqueues.
760 * We do that only once.
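		 * Lock order is by CPU address: if tci sorts after ci, its
		 * lock can be taken while ours is held; otherwise try-lock
		 * it, and on failure back off, take both locks in the safe
		 * order and re-check the situation.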
761 */
762 if (dlock == false) {
763 dlock = true;
764 if (ci < tci) {
765 spc_lock(tci);
766 } else if (!mutex_tryenter(tspc->spc_mutex)) {
767 spc_unlock(ci);
768 spc_lock(tci);
769 spc_lock(ci);
770 				/* Check the situation again. */
771 continue;
772 }
773 }
774
775 /* Migrate the thread */
776 KASSERT(l->l_stat == LSRUN);
777 spc->spc_migrating = NULL;
778 l->l_target_cpu = NULL;
779 sched_dequeue(l);
780 l->l_cpu = tci;
781 lwp_setlock(l, tspc->spc_mutex);
782 sched_enqueue(l);
783 sched_resched_lwp(l, true);
784 /* tci now unlocked */
785 spc_unlock(ci);
786 return tci;
787 }
788 if (dlock == true) {
789 KASSERT(tci != NULL);
790 spc_unlock(tci);
791 }
792 spc_unlock(ci);
793 return NULL;
794 }
795
796 /*
797 * Try to steal an LWP from "tci".
798 */
799 static bool
800 sched_steal(struct cpu_info *ci, struct cpu_info *tci)
801 {
802 struct schedstate_percpu *spc, *tspc;
803 lwp_t *l;
804
805 spc = &ci->ci_schedstate;
806 tspc = &tci->ci_schedstate;
807 if (atomic_load_relaxed(&tspc->spc_mcount) != 0 &&
808 spc->spc_psid == tspc->spc_psid) {
809 spc_dlock(ci, tci);
810 l = sched_catchlwp(tci);
811 spc_unlock(ci);
812 if (l != NULL) {
813 return true;
814 }
815 }
816 return false;
817 }
818
819 /*
820 * Called from each CPU's idle loop.
821 */
822 void
823 sched_idle(void)
824 {
825 struct cpu_info *ci, *inner, *outer, *first, *tci, *mci;
826 struct schedstate_percpu *spc, *tspc;
827 struct lwp *l;
828
829 ci = curcpu();
830 spc = &ci->ci_schedstate;
831 tci = NULL;
832 mci = NULL;
833
834 /*
835  * Handle LWP migrations off this CPU to another. If there is a
836 * migration to do then remember the CPU the LWP was sent to, and
837 * don't steal the LWP back from that CPU below.
838 */
839 if (spc->spc_migrating != NULL) {
840 mci = sched_idle_migrate();
841 }
842
843 /* If this CPU is offline, or we have an LWP to run, we're done. */
844 if ((spc->spc_flags & SPCF_OFFLINE) != 0 || spc->spc_count != 0) {
845 return;
846 }
847
848 /* Deal with SMT. */
849 if (ci->ci_nsibling[CPUREL_CORE] > 1) {
850 /* Try to help our siblings out. */
851 tci = ci->ci_sibling[CPUREL_CORE];
852 while (tci != ci) {
853 if (tci != mci && sched_steal(ci, tci)) {
854 return;
855 }
856 tci = tci->ci_sibling[CPUREL_CORE];
857 }
858 /*
859 * If not the first SMT in the core, and in the default
860 * processor set, the search ends here.
861 */
862 if ((spc->spc_flags & SPCF_1STCLASS) == 0 &&
863 spc->spc_psid == PS_NONE) {
864 return;
865 }
866 }
867
868 /*
869 * Find something to run, unless this CPU exceeded the rate limit.
870 * Start looking on the current package to maximise L2/L3 cache
871 * locality. Then expand to looking at the rest of the system.
872 *
873 * XXX Should probably look at 2nd class CPUs first, but they will
874 * shed jobs via preempt() anyway.
875 */
876 if (spc->spc_nextskim > getticks()) {
877 return;
878 }
879 spc->spc_nextskim = getticks() + mstohz(skim_interval);
880
881 /* In the outer loop scroll through all CPU packages, starting here. */
882 first = ci->ci_package1st;
883 outer = first;
884 do {
885 /* In the inner loop scroll through all CPUs in package. */
886 inner = outer;
887 do {
888 /* Don't hit the locks unless needed. */
889 tspc = &inner->ci_schedstate;
890 			if (ci == inner || inner == mci ||
891 spc->spc_psid != tspc->spc_psid ||
892 atomic_load_relaxed(&tspc->spc_mcount) < min_catch) {
893 continue;
894 }
895 spc_dlock(ci, inner);
896 l = sched_catchlwp(inner);
897 spc_unlock(ci);
898 if (l != NULL) {
899 /* Got it! */
900 return;
901 }
902 } while (inner = inner->ci_sibling[CPUREL_PACKAGE],
903 inner != outer);
904 } while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
905 outer != first);
906 }
907
908 /*
909 * Called from mi_switch() when an LWP has been preempted / has yielded.
910 * The LWP is presently in the CPU's run queue. Here we look for a better
911 * CPU to teleport the LWP to; there may not be one.
912 */
913 void
914 sched_preempted(struct lwp *l)
915 {
916 const int flags = SPCF_IDLE | SPCF_1STCLASS;
917 struct schedstate_percpu *tspc;
918 struct cpu_info *ci, *tci;
919
920 ci = l->l_cpu;
921 tspc = &ci->ci_schedstate;
922
923 KASSERT(tspc->spc_count >= 1);
924
925 /*
926 * Try to select another CPU if:
927 *
928 * - there is no migration pending already
929 * - and this LWP is running on a 2nd class CPU
930 * - or this LWP is a child of vfork() that has just done execve()
931 */
932 if (l->l_target_cpu != NULL ||
933 ((tspc->spc_flags & SPCF_1STCLASS) != 0 &&
934 (l->l_pflag & LP_TELEPORT) == 0)) {
935 return;
936 }
937
938 /*
939 * Fast path: if the first SMT in the core is idle, send it back
940 * there, because the cache is shared (cheap) and we want all LWPs
941 * to be clustered on 1st class CPUs (either running there or on
942 * their runqueues).
943 */
944 tci = ci->ci_sibling[CPUREL_CORE];
945 while (tci != ci) {
946 tspc = &tci->ci_schedstate;
947 if ((tspc->spc_flags & flags) == flags &&
948 sched_migratable(l, tci)) {
949 l->l_target_cpu = tci;
950 l->l_pflag &= ~LP_TELEPORT;
951 return;
952 }
953 tci = tci->ci_sibling[CPUREL_CORE];
954 }
955
956 if ((l->l_pflag & LP_TELEPORT) != 0) {
957 /*
958 * A child of vfork(): now that the parent is released,
959 * scatter far and wide, to match the LSIDL distribution
960 * done in sched_takecpu().
961 */
962 l->l_pflag &= ~LP_TELEPORT;
963 tci = sched_bestcpu(l, sched_nextpkg());
964 if (tci != ci) {
965 l->l_target_cpu = tci;
966 }
967 } else {
968 /*
969 * Try to find a better CPU to take it, but don't move to
970 * another 2nd class CPU, and don't move to a non-idle CPU,
971 * because that would prevent SMT being used to maximise
972 * throughput.
973 *
974 * Search in the current CPU package in order to try and
975 * keep L2/L3 cache locality, but expand to include the
976 * whole system if needed.
977 */
978 tci = sched_bestcpu(l, l->l_cpu);
979 if (tci != ci &&
980 (tci->ci_schedstate.spc_flags & flags) == flags) {
981 l->l_target_cpu = tci;
982 }
983 }
984 }
985
986 /*
987 * Called during execve() by a child of vfork(). Does two things:
988 *
989 * - If the parent has been awoken and put back on curcpu then give the
990 * CPU back to the parent.
991 *
992 * - If curlwp is not on a 1st class CPU then find somewhere else to run,
993 * since it dodged the distribution in sched_takecpu() when first set
994 * runnable.
995 */
996 void
997 sched_vforkexec(struct lwp *l, bool samecpu)
998 {
999
1000 KASSERT(l == curlwp);
1001 if ((samecpu && ncpu > 1) ||
1002 (l->l_cpu->ci_schedstate.spc_flags & SPCF_1STCLASS) == 0) {
1003 l->l_pflag |= LP_TELEPORT;
1004 preempt();
1005 }
1006 }
1007
1008 #else
1009
1010 /*
1011 * stubs for !MULTIPROCESSOR
1012 */
1013
1014 struct cpu_info *
1015 sched_takecpu(struct lwp *l)
1016 {
1017
1018 return l->l_cpu;
1019 }
1020
1021 void
1022 sched_idle(void)
1023 {
1024
1025 }
1026
1027 void
1028 sched_preempted(struct lwp *l)
1029 {
1030
1031 }
1032
1033 void
1034 sched_vforkexec(struct lwp *l, bool samecpu)
1035 {
1036
1037 KASSERT(l == curlwp);
1038 }
1039
1040 #endif /* MULTIPROCESSOR */
1041
1042 /*
1043 * Scheduling statistics and balancing.
1044 */
1045 void
1046 sched_lwp_stats(struct lwp *l)
1047 {
1048 int batch;
1049
1050 KASSERT(lwp_locked(l, NULL));
1051
1052 /* Update sleep time */
1053 if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
1054 l->l_stat == LSSUSPENDED)
1055 l->l_slptime++;
1056
1057 /*
1058 	 * Mark the thread as CPU-bound if its accumulated run time exceeds its
1059 	 * accumulated sleep time; report it as batch only if already marked.
1060 */
1061 batch = (l->l_rticksum > l->l_slpticksum);
1062 if (batch != 0) {
1063 if ((l->l_flag & LW_BATCH) == 0)
1064 batch = 0;
1065 l->l_flag |= LW_BATCH;
1066 } else
1067 l->l_flag &= ~LW_BATCH;
1068
1069 /* Reset the time sums */
1070 l->l_slpticksum = 0;
1071 l->l_rticksum = 0;
1072
1073 /* Scheduler-specific hook */
1074 sched_pstats_hook(l, batch);
1075 #ifdef KDTRACE_HOOKS
1076 curthread = l;
1077 #endif
1078 }
1079
1080 /*
1081 * Scheduler mill.
1082 */
1083 struct lwp *
1084 sched_nextlwp(void)
1085 {
1086 struct cpu_info *ci = curcpu();
1087 struct schedstate_percpu *spc;
1088 TAILQ_HEAD(, lwp) *q_head;
1089 struct lwp *l;
1090
1091 /* Update the last run time on switch */
1092 l = curlwp;
1093 l->l_rticksum += (getticks() - l->l_rticks);
1094
1095 /* Return to idle LWP if there is a migrating thread */
1096 spc = &ci->ci_schedstate;
1097 if (__predict_false(spc->spc_migrating != NULL))
1098 return NULL;
1099
1100 /* Return to idle LWP if there is no runnable job */
1101 if (__predict_false(spc->spc_count == 0))
1102 return NULL;
1103
1104 /* Take the highest priority thread */
1105 KASSERT(spc->spc_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
1106 q_head = sched_getrq(spc, spc->spc_maxpriority);
1107 l = TAILQ_FIRST(q_head);
1108 KASSERT(l != NULL);
1109
1110 sched_oncpu(l);
1111 l->l_rticks = getticks();
1112
1113 return l;
1114 }
1115
1116 /*
1117  * sched_curcpu_runnable_p: return whether curcpu() should exit the idle loop.
1118 */
1119
1120 bool
1121 sched_curcpu_runnable_p(void)
1122 {
1123 const struct cpu_info *ci;
1124 const struct schedstate_percpu *spc;
1125 bool rv;
1126
1127 kpreempt_disable();
1128 ci = curcpu();
1129 spc = &ci->ci_schedstate;
1130 rv = (spc->spc_count != 0);
1131 #ifndef __HAVE_FAST_SOFTINTS
1132 rv |= (ci->ci_data.cpu_softints != 0);
1133 #endif
1134 kpreempt_enable();
1135
1136 return rv;
1137 }
1138
1139 /*
1140 * Sysctl nodes and initialization.
1141 */
1142
1143 SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
1144 {
1145 const struct sysctlnode *node = NULL;
1146
1147 sysctl_createv(clog, 0, NULL, &node,
1148 CTLFLAG_PERMANENT,
1149 CTLTYPE_NODE, "sched",
1150 SYSCTL_DESCR("Scheduler options"),
1151 NULL, 0, NULL, 0,
1152 CTL_KERN, CTL_CREATE, CTL_EOL);
1153
1154 if (node == NULL)
1155 return;
1156
1157 sysctl_createv(clog, 0, &node, NULL,
1158 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1159 CTLTYPE_INT, "cacheht_time",
1160 SYSCTL_DESCR("Cache hotness time (in ms)"),
1161 NULL, 0, &cacheht_time, 0,
1162 CTL_CREATE, CTL_EOL);
1163 sysctl_createv(clog, 0, &node, NULL,
1164 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1165 CTLTYPE_INT, "skim_interval",
1166 SYSCTL_DESCR("Rate limit for stealing from other CPUs (in ms)"),
1167 NULL, 0, &skim_interval, 0,
1168 CTL_CREATE, CTL_EOL);
1169 sysctl_createv(clog, 0, &node, NULL,
1170 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1171 CTLTYPE_INT, "min_catch",
1172 SYSCTL_DESCR("Minimal count of threads for catching"),
1173 NULL, 0, &min_catch, 0,
1174 CTL_CREATE, CTL_EOL);
1175 sysctl_createv(clog, 0, &node, NULL,
1176 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1177 CTLTYPE_INT, "timesoftints",
1178 SYSCTL_DESCR("Track CPU time for soft interrupts"),
1179 NULL, 0, &softint_timing, 0,
1180 CTL_CREATE, CTL_EOL);
1181 sysctl_createv(clog, 0, &node, NULL,
1182 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1183 CTLTYPE_INT, "kpreempt_pri",
1184 SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
1185 NULL, 0, &sched_kpreempt_pri, 0,
1186 CTL_CREATE, CTL_EOL);
1187 }
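
/*
 * Illustrative userland sketch (not part of this file): the nodes
 * created above live under kern.sched and can be inspected or tuned
 * with sysctlbyname(3), e.g. reading and raising the skim rate limit:
 *
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int oldval, newval = 20;	20ms is an assumed value
 *		size_t len = sizeof(oldval);
 *
 *		if (sysctlbyname("kern.sched.skim_interval", &oldval, &len,
 *		    &newval, sizeof(newval)) == -1)
 *			return 1;
 *		(void)printf("skim_interval was %d ms\n", oldval);
 *		return 0;
 *	}
 */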
1188
1189 /*
1190 * Debugging.
1191 */
1192
1193 #ifdef DDB
1194
1195 void
1196 sched_print_runqueue(void (*pr)(const char *, ...))
1197 {
1198 struct cpu_info *ci, *tci;
1199 struct schedstate_percpu *spc;
1200 struct lwp *l;
1201 struct proc *p;
1202 CPU_INFO_ITERATOR cii;
1203
1204 for (CPU_INFO_FOREACH(cii, ci)) {
1205 int i;
1206
1207 spc = &ci->ci_schedstate;
1208
1209 (*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
1210 (*pr)(" pid.lid = %d.%d, r_count = %u, "
1211 "maxpri = %d, mlwp = %p\n",
1212 #ifdef MULTIPROCESSOR
1213 ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
1214 #else
1215 curlwp->l_proc->p_pid, curlwp->l_lid,
1216 #endif
1217 spc->spc_count, spc->spc_maxpriority,
1218 spc->spc_migrating);
1219 i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
1220 do {
1221 uint32_t q;
1222 q = spc->spc_bitmap[i];
1223 (*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
1224 } while (i--);
1225 }
1226
1227 (*pr)(" %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
1228 "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");
1229
1230 PROCLIST_FOREACH(p, &allproc) {
1231 (*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
1232 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1233 ci = l->l_cpu;
1234 tci = l->l_target_cpu;
1235 (*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
1236 (int)l->l_lid, l->l_priority, lwp_eprio(l),
1237 l->l_flag, l->l_stat == LSRUN ? "RQ" :
1238 (l->l_stat == LSSLEEP ? "SQ" : "-"),
1239 l, ci->ci_index, (tci ? tci->ci_index : -1),
1240 (u_int)(getticks() - l->l_rticks));
1241 }
1242 }
1243 }
1244
1245 #endif