FreeBSD/Linux Kernel Cross Reference
sys/kern/sched_ule.c
1 /*-
2 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice unmodified, this list of conditions, and the following
10 * disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD: releng/5.3/sys/kern/sched_ule.c 136957 2004-10-26 02:22:54Z scottl $");
29
30 #include <opt_sched.h>
31
32 #define kse td_sched
33
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kdb.h>
37 #include <sys/kernel.h>
38 #include <sys/ktr.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/proc.h>
42 #include <sys/resource.h>
43 #include <sys/resourcevar.h>
44 #include <sys/sched.h>
45 #include <sys/smp.h>
46 #include <sys/sx.h>
47 #include <sys/sysctl.h>
48 #include <sys/sysproto.h>
49 #include <sys/vmmeter.h>
50 #ifdef KTRACE
51 #include <sys/uio.h>
52 #include <sys/ktrace.h>
53 #endif
54
55 #include <machine/cpu.h>
56 #include <machine/smp.h>
57
58 #define KTR_ULE KTR_NFS
59
60 #error "The SCHED_ULE scheduler is broken. Please use SCHED_4BSD"
61
62 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
63 /* XXX This is bogus compatibility crap for ps */
64 static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
65 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
66
67 static void sched_setup(void *dummy);
68 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
69
70 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
71
72 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
73 "Scheduler name");
74
75 static int slice_min = 1;
76 SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
77
78 static int slice_max = 10;
79 SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
80
81 int realstathz;
82 int tickincr = 1;
83
84 #ifdef PREEMPTION
85 static void
86 printf_caddr_t(void *data)
87 {
88 printf("%s", (char *)data);
89 }
90 static char preempt_warning[] =
91 "WARNING: Kernel PREEMPTION is unstable under SCHED_ULE.\n";
92 SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
93 preempt_warning)
94 #endif
95
96 /*
97 * The schedulable entity that can be given a context to run.
98 * A process may have several of these. Probably one per processor
99  * but possibly a few more. In this universe they are grouped
100 * with a KSEG that contains the priority and niceness
101 * for the group.
102 */
103 struct kse {
104 TAILQ_ENTRY(kse) ke_kglist; /* (*) Queue of threads in ke_ksegrp. */
105 TAILQ_ENTRY(kse) ke_kgrlist; /* (*) Queue of threads in this state.*/
106 TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */
107 int ke_flags; /* (j) KEF_* flags. */
108 struct thread *ke_thread; /* (*) Active associated thread. */
109 fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */
110 u_char ke_oncpu; /* (j) Which cpu we are on. */
111 char ke_rqindex; /* (j) Run queue index. */
112 enum {
113 KES_THREAD = 0x0, /* slaved to thread state */
114 KES_ONRUNQ
115 } ke_state; /* (j) thread sched specific status. */
116 int ke_slptime;
117 int ke_slice;
118 struct runq *ke_runq;
119 u_char ke_cpu; /* CPU that we have affinity for. */
120 /* The following variables are only used for pctcpu calculation */
121 int ke_ltick; /* Last tick that we were running on */
122 int ke_ftick; /* First tick that we were running on */
123 int ke_ticks; /* Tick count */
124
125 };
126
127
128 #define td_kse td_sched
129 #define td_slptime td_kse->ke_slptime
130 #define ke_proc ke_thread->td_proc
131 #define ke_ksegrp ke_thread->td_ksegrp
132
133 /* flags kept in ke_flags */
134 #define KEF_SCHED0 0x00001 /* For scheduler-specific use. */
135 #define KEF_SCHED1 0x00002 /* For scheduler-specific use. */
136 #define KEF_SCHED2 0x00004 /* For scheduler-specific use. */
137 #define KEF_SCHED3 0x00008 /* For scheduler-specific use. */
138 #define KEF_DIDRUN 0x02000 /* Thread actually ran. */
139 #define KEF_EXIT 0x04000 /* Thread is being killed. */
140
141 /*
142 * These datastructures are allocated within their parent datastructure but
143 * are scheduler specific.
144 */
145
146 #define ke_assign ke_procq.tqe_next
147
148 #define KEF_ASSIGNED KEF_SCHED0 /* Thread is being migrated. */
149 #define KEF_BOUND KEF_SCHED1 /* Thread can not migrate. */
150 #define KEF_XFERABLE KEF_SCHED2 /* Thread was added as transferable. */
151 #define KEF_HOLD KEF_SCHED3 /* Thread is temporarily bound. */
152
153 struct kg_sched {
154 struct thread *skg_last_assigned; /* (j) Last thread assigned to */
155 /* the system scheduler */
156 int skg_slptime; /* Number of ticks we vol. slept */
157 int skg_runtime; /* Number of ticks we were running */
158 int skg_avail_opennings; /* (j) Num unfilled slots in group.*/
159 int skg_concurrency; /* (j) Num threads requested in group.*/
160 int skg_runq_threads; /* (j) Num KSEs on runq. */
161 };
162 #define kg_last_assigned kg_sched->skg_last_assigned
163 #define kg_avail_opennings kg_sched->skg_avail_opennings
164 #define kg_concurrency kg_sched->skg_concurrency
165 #define kg_runq_threads kg_sched->skg_runq_threads
166 #define kg_runtime kg_sched->skg_runtime
167 #define kg_slptime kg_sched->skg_slptime
168
169 #define SLOT_RELEASE(kg) \
170 do { \
171 kg->kg_avail_opennings++; \
172 CTR3(KTR_RUNQ, "kg %p(%d) Slot released (->%d)", \
173 kg, \
174 kg->kg_concurrency, \
175 kg->kg_avail_opennings); \
176 /*KASSERT((kg->kg_avail_opennings <= kg->kg_concurrency), \
177 ("slots out of whack")); */ \
178 } while (0)
179
180 #define SLOT_USE(kg) \
181 do { \
182 kg->kg_avail_opennings--; \
183 CTR3(KTR_RUNQ, "kg %p(%d) Slot used (->%d)", \
184 kg, \
185 kg->kg_concurrency, \
186 kg->kg_avail_opennings); \
187 /*KASSERT((kg->kg_avail_opennings >= 0), \
188 ("slots out of whack"));*/ \
189 } while (0)
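/*
 * SLOT_USE() and SLOT_RELEASE() track kg_avail_opennings, the count of
 * unfilled run slots a ksegrp may occupy (nominally bounded by
 * kg_concurrency).  In this file a slot is consumed in sched_add_internal()
 * and when sched_switch() brings in a supplied thread, and returned in
 * sched_rem() and when sched_switch() ends a thread's run.
 */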
190
191 static struct kse kse0;
192 static struct kg_sched kg_sched0;
193
194 /*
195 * The priority is primarily determined by the interactivity score. Thus, we
196 * give lower(better) priorities to kse groups that use less CPU. The nice
197 * value is then directly added to this to allow nice to have some effect
198 * on latency.
199 *
200 * PRI_RANGE: Total priority range for timeshare threads.
201 * PRI_NRESV: Number of nice values.
202 * PRI_BASE: The start of the dynamic range.
203 */
204 #define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
205 #define SCHED_PRI_NRESV ((PRIO_MAX - PRIO_MIN) + 1)
206 #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2)
207 #define SCHED_PRI_BASE (PRI_MIN_TIMESHARE)
208 #define SCHED_PRI_INTERACT(score) \
209 ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
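/*
 * For illustration, assuming the stock priority layout where
 * PRI_MIN_TIMESHARE is 160 and PRI_MAX_TIMESHARE is 223 (so
 * SCHED_PRI_RANGE is 64) and SCHED_INTERACT_MAX is 100:
 *
 *	score   0 -> SCHED_PRI_INTERACT(0)   =  0  (pri 160 + nice)
 *	score  30 -> SCHED_PRI_INTERACT(30)  = 19  (pri 179 + nice)
 *	score 100 -> SCHED_PRI_INTERACT(100) = 64  (clamped back to 223)
 *
 * sched_priority() below adds SCHED_PRI_BASE and p_nice to the scaled
 * score and clamps the result to the timeshare range.
 */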
210
211 /*
212 * These determine the interactivity of a process.
213 *
214 * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate
215 * before throttling back.
216 * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time.
217 * INTERACT_MAX: Maximum interactivity value. Smaller is better.
218  * INTERACT_THRESH:	Threshold for placement on the current runq.
219 */
220 #define SCHED_SLP_RUN_MAX ((hz * 5) << 10)
221 #define SCHED_SLP_RUN_FORK ((hz / 2) << 10)
222 #define SCHED_INTERACT_MAX (100)
223 #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2)
224 #define SCHED_INTERACT_THRESH (30)
225
226 /*
227 * These parameters and macros determine the size of the time slice that is
228 * granted to each thread.
229 *
230 * SLICE_MIN: Minimum time slice granted, in units of ticks.
231 * SLICE_MAX: Maximum time slice granted.
232 * SLICE_RANGE: Range of available time slices scaled by hz.
233  * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
234 * SLICE_NICE: Determine the amount of slice granted to a scaled nice.
235 * SLICE_NTHRESH: The nice cutoff point for slice assignment.
236 */
237 #define SCHED_SLICE_MIN (slice_min)
238 #define SCHED_SLICE_MAX (slice_max)
239 #define SCHED_SLICE_INTERACTIVE (slice_max)
240 #define SCHED_SLICE_NTHRESH (SCHED_PRI_NHALF - 1)
241 #define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
242 #define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max))
243 #define SCHED_SLICE_NICE(nice) \
244 (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
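/*
 * For illustration, assuming hz = 1000 so that sched_setup() picks
 * slice_min = 10 and slice_max = 142 ticks (giving SCHED_SLICE_RANGE = 133
 * and SCHED_SLICE_NTHRESH = 19):
 *
 *	nice == nicemin      -> SCHED_SLICE_NICE(0)  = 142 ticks
 *	nice == nicemin + 5  -> SCHED_SLICE_NICE(5)  = 142 - 35  = 107 ticks
 *	nice == nicemin + 19 -> SCHED_SLICE_NICE(19) = 142 - 133 = 9 ticks
 *
 * Anything further than SCHED_SLICE_NTHRESH from the least nice ksegrp
 * gets no slice at all from sched_slice(), except that nice 0 ksegrps
 * always receive at least SCHED_SLICE_MIN.
 */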
245
246 /*
247 * This macro determines whether or not the thread belongs on the current or
248 * next run queue.
249 */
250 #define SCHED_INTERACTIVE(kg) \
251 (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
252 #define SCHED_CURR(kg, ke) \
253 (ke->ke_thread->td_priority < kg->kg_user_pri || \
254 SCHED_INTERACTIVE(kg))
255
256 /*
257 * Cpu percentage computation macros and defines.
258 *
259 * SCHED_CPU_TIME: Number of seconds to average the cpu usage across.
260 * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across.
261 */
262
263 #define SCHED_CPU_TIME 10
264 #define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME)
265
266 /*
267 * kseq - per processor runqs and statistics.
268 */
269 struct kseq {
270 struct runq ksq_idle; /* Queue of IDLE threads. */
271 struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */
272 struct runq *ksq_next; /* Next timeshare queue. */
273 struct runq *ksq_curr; /* Current queue. */
274 int ksq_load_timeshare; /* Load for timeshare. */
275 int ksq_load; /* Aggregate load. */
276 short ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
277 short ksq_nicemin; /* Least nice. */
278 #ifdef SMP
279 int ksq_transferable;
280 LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */
281 struct kseq_group *ksq_group; /* Our processor group. */
282 volatile struct kse *ksq_assigned; /* assigned by another CPU. */
283 #else
284 int ksq_sysload; /* For loadavg, !ITHD load. */
285 #endif
286 };
287
288 #ifdef SMP
289 /*
290 * kseq groups are groups of processors which can cheaply share threads. When
291 * one processor in the group goes idle it will check the runqs of the other
292 * processors in its group prior to halting and waiting for an interrupt.
293  * These groups are suitable for SMT (Simultaneous Multi-Threading) and not NUMA.
294  * In a NUMA environment we'd want an idle bitmap per group and a two tiered
295 * load balancer.
296 */
297 struct kseq_group {
298 int ksg_cpus; /* Count of CPUs in this kseq group. */
299 cpumask_t ksg_cpumask; /* Mask of cpus in this group. */
300 cpumask_t ksg_idlemask; /* Idle cpus in this group. */
301 cpumask_t ksg_mask; /* Bit mask for first cpu. */
302 int ksg_load; /* Total load of this group. */
303 int ksg_transferable; /* Transferable load of this group. */
304 LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */
305 };
306 #endif
307
308 /*
309 * One kse queue per processor.
310 */
311 #ifdef SMP
312 static cpumask_t kseq_idle;
313 static int ksg_maxid;
314 static struct kseq kseq_cpu[MAXCPU];
315 static struct kseq_group kseq_groups[MAXCPU];
316 static int bal_tick;
317 static int gbal_tick;
318
319 #define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)])
320 #define KSEQ_CPU(x) (&kseq_cpu[(x)])
321 #define KSEQ_ID(x) ((x) - kseq_cpu)
322 #define KSEQ_GROUP(x) (&kseq_groups[(x)])
323 #else /* !SMP */
324 static struct kseq kseq_cpu;
325
326 #define KSEQ_SELF() (&kseq_cpu)
327 #define KSEQ_CPU(x) (&kseq_cpu)
328 #endif
329
330 static void slot_fill(struct ksegrp *kg);
331 static struct kse *sched_choose(void); /* XXX Should be thread * */
332 static void sched_add_internal(struct thread *td, int preemptive);
333 static void sched_slice(struct kse *ke);
334 static void sched_priority(struct ksegrp *kg);
335 static int sched_interact_score(struct ksegrp *kg);
336 static void sched_interact_update(struct ksegrp *kg);
337 static void sched_interact_fork(struct ksegrp *kg);
338 static void sched_pctcpu_update(struct kse *ke);
339
340 /* Operations on per processor queues */
341 static struct kse * kseq_choose(struct kseq *kseq);
342 static void kseq_setup(struct kseq *kseq);
343 static void kseq_load_add(struct kseq *kseq, struct kse *ke);
344 static void kseq_load_rem(struct kseq *kseq, struct kse *ke);
345 static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke);
346 static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke);
347 static void kseq_nice_add(struct kseq *kseq, int nice);
348 static void kseq_nice_rem(struct kseq *kseq, int nice);
349 void kseq_print(int cpu);
350 #ifdef SMP
351 static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class);
352 static struct kse *runq_steal(struct runq *rq);
353 static void sched_balance(void);
354 static void sched_balance_groups(void);
355 static void sched_balance_group(struct kseq_group *ksg);
356 static void sched_balance_pair(struct kseq *high, struct kseq *low);
357 static void kseq_move(struct kseq *from, int cpu);
358 static int kseq_idled(struct kseq *kseq);
359 static void kseq_notify(struct kse *ke, int cpu);
360 static void kseq_assign(struct kseq *);
361 static struct kse *kseq_steal(struct kseq *kseq, int stealidle);
362 /*
363 * On P4 Xeons the round-robin interrupt delivery is broken. As a result of
364  * this, we can't pin ithreads to the cpu that their interrupt was
365  * delivered to, or all ithreads would end up running on CPU 0.
366 */
367 #ifdef __i386__
368 #define KSE_CAN_MIGRATE(ke, class) \
369 ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
370 #else /* !__i386__ */
371 #define KSE_CAN_MIGRATE(ke, class) \
372 ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 && \
373 ((ke)->ke_flags & KEF_BOUND) == 0)
374 #endif /* !__i386__ */
375 #endif
376
377 void
378 kseq_print(int cpu)
379 {
380 struct kseq *kseq;
381 int i;
382
383 kseq = KSEQ_CPU(cpu);
384
385 printf("kseq:\n");
386 printf("\tload: %d\n", kseq->ksq_load);
387 printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
388 #ifdef SMP
389 printf("\tload transferable: %d\n", kseq->ksq_transferable);
390 #endif
391 printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
392 printf("\tnice counts:\n");
393 for (i = 0; i < SCHED_PRI_NRESV; i++)
394 if (kseq->ksq_nice[i])
395 printf("\t\t%d = %d\n",
396 i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
397 }
398
399 static __inline void
400 kseq_runq_add(struct kseq *kseq, struct kse *ke)
401 {
402 #ifdef SMP
403 if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
404 kseq->ksq_transferable++;
405 kseq->ksq_group->ksg_transferable++;
406 ke->ke_flags |= KEF_XFERABLE;
407 }
408 #endif
409 runq_add(ke->ke_runq, ke, 0);
410 }
411
412 static __inline void
413 kseq_runq_rem(struct kseq *kseq, struct kse *ke)
414 {
415 #ifdef SMP
416 if (ke->ke_flags & KEF_XFERABLE) {
417 kseq->ksq_transferable--;
418 kseq->ksq_group->ksg_transferable--;
419 ke->ke_flags &= ~KEF_XFERABLE;
420 }
421 #endif
422 runq_remove(ke->ke_runq, ke);
423 }
424
425 static void
426 kseq_load_add(struct kseq *kseq, struct kse *ke)
427 {
428 int class;
429 mtx_assert(&sched_lock, MA_OWNED);
430 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
431 if (class == PRI_TIMESHARE)
432 kseq->ksq_load_timeshare++;
433 kseq->ksq_load++;
434 if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
435 #ifdef SMP
436 kseq->ksq_group->ksg_load++;
437 #else
438 kseq->ksq_sysload++;
439 #endif
440 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
441 CTR6(KTR_ULE,
442 "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
443 ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
444 ke->ke_proc->p_nice, kseq->ksq_nicemin);
445 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
446 kseq_nice_add(kseq, ke->ke_proc->p_nice);
447 }
448
449 static void
450 kseq_load_rem(struct kseq *kseq, struct kse *ke)
451 {
452 int class;
453 mtx_assert(&sched_lock, MA_OWNED);
454 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
455 if (class == PRI_TIMESHARE)
456 kseq->ksq_load_timeshare--;
457 if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
458 #ifdef SMP
459 kseq->ksq_group->ksg_load--;
460 #else
461 kseq->ksq_sysload--;
462 #endif
463 kseq->ksq_load--;
464 ke->ke_runq = NULL;
465 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
466 kseq_nice_rem(kseq, ke->ke_proc->p_nice);
467 }
468
469 static void
470 kseq_nice_add(struct kseq *kseq, int nice)
471 {
472 mtx_assert(&sched_lock, MA_OWNED);
473 /* Normalize to zero. */
474 kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
475 if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
476 kseq->ksq_nicemin = nice;
477 }
478
479 static void
480 kseq_nice_rem(struct kseq *kseq, int nice)
481 {
482 int n;
483
484 mtx_assert(&sched_lock, MA_OWNED);
485 /* Normalize to zero. */
486 n = nice + SCHED_PRI_NHALF;
487 kseq->ksq_nice[n]--;
488 KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));
489
490 /*
491 * If this wasn't the smallest nice value or there are more in
492 * this bucket we can just return. Otherwise we have to recalculate
493 * the smallest nice.
494 */
495 if (nice != kseq->ksq_nicemin ||
496 kseq->ksq_nice[n] != 0 ||
497 kseq->ksq_load_timeshare == 0)
498 return;
499
500 for (; n < SCHED_PRI_NRESV; n++)
501 if (kseq->ksq_nice[n]) {
502 kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
503 return;
504 }
505 }
506
507 #ifdef SMP
508 /*
509 * sched_balance is a simple CPU load balancing algorithm. It operates by
510 * finding the least loaded and most loaded cpu and equalizing their load
511 * by migrating some processes.
512 *
513 * Dealing only with two CPUs at a time has two advantages. Firstly, most
514 * installations will only have 2 cpus. Secondly, load balancing too much at
515 * once can have an unpleasant effect on the system. The scheduler rarely has
516  * enough information to make perfect decisions.  So this algorithm favors
517  * simplicity and more gradual effects on load in larger systems.
518 *
519 * It could be improved by considering the priorities and slices assigned to
520 * each task prior to balancing them. There are many pathological cases with
521 * any approach and so the semi random algorithm below may work as well as any.
522 *
523 */
524 static void
525 sched_balance(void)
526 {
527 struct kseq_group *high;
528 struct kseq_group *low;
529 struct kseq_group *ksg;
530 int cnt;
531 int i;
532
533 if (smp_started == 0)
534 goto out;
535 low = high = NULL;
536 i = random() % (ksg_maxid + 1);
537 for (cnt = 0; cnt <= ksg_maxid; cnt++) {
538 ksg = KSEQ_GROUP(i);
539 /*
540 * Find the CPU with the highest load that has some
541 * threads to transfer.
542 */
543 if ((high == NULL || ksg->ksg_load > high->ksg_load)
544 && ksg->ksg_transferable)
545 high = ksg;
546 if (low == NULL || ksg->ksg_load < low->ksg_load)
547 low = ksg;
548 if (++i > ksg_maxid)
549 i = 0;
550 }
551 if (low != NULL && high != NULL && high != low)
552 sched_balance_pair(LIST_FIRST(&high->ksg_members),
553 LIST_FIRST(&low->ksg_members));
554 out:
555 bal_tick = ticks + (random() % (hz * 2));
556 }
557
558 static void
559 sched_balance_groups(void)
560 {
561 int i;
562
563 mtx_assert(&sched_lock, MA_OWNED);
564 if (smp_started)
565 for (i = 0; i <= ksg_maxid; i++)
566 sched_balance_group(KSEQ_GROUP(i));
567 gbal_tick = ticks + (random() % (hz * 2));
568 }
569
570 static void
571 sched_balance_group(struct kseq_group *ksg)
572 {
573 struct kseq *kseq;
574 struct kseq *high;
575 struct kseq *low;
576 int load;
577
578 if (ksg->ksg_transferable == 0)
579 return;
580 low = NULL;
581 high = NULL;
582 LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
583 load = kseq->ksq_load;
584 if (high == NULL || load > high->ksq_load)
585 high = kseq;
586 if (low == NULL || load < low->ksq_load)
587 low = kseq;
588 }
589 if (high != NULL && low != NULL && high != low)
590 sched_balance_pair(high, low);
591 }
592
593 static void
594 sched_balance_pair(struct kseq *high, struct kseq *low)
595 {
596 int transferable;
597 int high_load;
598 int low_load;
599 int move;
600 int diff;
601 int i;
602
603 /*
604  * If we're transferring within a group we have to use this specific
605 * kseq's transferable count, otherwise we can steal from other members
606 * of the group.
607 */
608 if (high->ksq_group == low->ksq_group) {
609 transferable = high->ksq_transferable;
610 high_load = high->ksq_load;
611 low_load = low->ksq_load;
612 } else {
613 transferable = high->ksq_group->ksg_transferable;
614 high_load = high->ksq_group->ksg_load;
615 low_load = low->ksq_group->ksg_load;
616 }
617 if (transferable == 0)
618 return;
619 /*
620 * Determine what the imbalance is and then adjust that to how many
621 * kses we actually have to give up (transferable).
622 */
623 diff = high_load - low_load;
624 move = diff / 2;
625 if (diff & 0x1)
626 move++;
627 move = min(move, transferable);
628 for (i = 0; i < move; i++)
629 kseq_move(high, KSEQ_ID(low));
630 return;
631 }
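/*
 * A small example of the arithmetic above, using hypothetical loads:
 * high_load = 5 and low_load = 2 give diff = 3, and move rounds up to 2
 * because diff is odd.  move is then clipped to the transferable count
 * and kseq_move() is invoked that many times to push kses toward the
 * less loaded cpu.
 */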
632
633 static void
634 kseq_move(struct kseq *from, int cpu)
635 {
636 struct kseq *kseq;
637 struct kseq *to;
638 struct kse *ke;
639
640 kseq = from;
641 to = KSEQ_CPU(cpu);
642 ke = kseq_steal(kseq, 1);
643 if (ke == NULL) {
644 struct kseq_group *ksg;
645
646 ksg = kseq->ksq_group;
647 LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
648 if (kseq == from || kseq->ksq_transferable == 0)
649 continue;
650 ke = kseq_steal(kseq, 1);
651 break;
652 }
653 if (ke == NULL)
654 panic("kseq_move: No KSEs available with a "
655 "transferable count of %d\n",
656 ksg->ksg_transferable);
657 }
658 if (kseq == to)
659 return;
660 ke->ke_state = KES_THREAD;
661 kseq_runq_rem(kseq, ke);
662 kseq_load_rem(kseq, ke);
663 kseq_notify(ke, cpu);
664 }
665
666 static int
667 kseq_idled(struct kseq *kseq)
668 {
669 struct kseq_group *ksg;
670 struct kseq *steal;
671 struct kse *ke;
672
673 ksg = kseq->ksq_group;
674 /*
675 * If we're in a cpu group, try and steal kses from another cpu in
676 * the group before idling.
677 */
678 if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
679 LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
680 if (steal == kseq || steal->ksq_transferable == 0)
681 continue;
682 ke = kseq_steal(steal, 0);
683 if (ke == NULL)
684 continue;
685 ke->ke_state = KES_THREAD;
686 kseq_runq_rem(steal, ke);
687 kseq_load_rem(steal, ke);
688 ke->ke_cpu = PCPU_GET(cpuid);
689 sched_add_internal(ke->ke_thread, 0);
690 return (0);
691 }
692 }
693 /*
694 * We only set the idled bit when all of the cpus in the group are
695 * idle. Otherwise we could get into a situation where a KSE bounces
696  * back and forth between two idle cores on separate physical CPUs.
697 */
698 ksg->ksg_idlemask |= PCPU_GET(cpumask);
699 if (ksg->ksg_idlemask != ksg->ksg_cpumask)
700 return (1);
701 atomic_set_int(&kseq_idle, ksg->ksg_mask);
702 return (1);
703 }
704
705 static void
706 kseq_assign(struct kseq *kseq)
707 {
708 struct kse *nke;
709 struct kse *ke;
710
711 do {
712 *(volatile struct kse **)&ke = kseq->ksq_assigned;
713 } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
714 for (; ke != NULL; ke = nke) {
715 nke = ke->ke_assign;
716 ke->ke_flags &= ~KEF_ASSIGNED;
717 sched_add_internal(ke->ke_thread, 0);
718 }
719 }
720
721 static void
722 kseq_notify(struct kse *ke, int cpu)
723 {
724 struct kseq *kseq;
725 struct thread *td;
726 struct pcpu *pcpu;
727 int prio;
728
729 ke->ke_cpu = cpu;
730 ke->ke_flags |= KEF_ASSIGNED;
731 prio = ke->ke_thread->td_priority;
732
733 kseq = KSEQ_CPU(cpu);
734
735 /*
736 * Place a KSE on another cpu's queue and force a resched.
737 */
738 do {
739 *(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned;
740 } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
741 /*
742 * Without sched_lock we could lose a race where we set NEEDRESCHED
743 * on a thread that is switched out before the IPI is delivered. This
744 * would lead us to miss the resched. This will be a problem once
745 * sched_lock is pushed down.
746 */
747 pcpu = pcpu_find(cpu);
748 td = pcpu->pc_curthread;
749 if (ke->ke_thread->td_priority < td->td_priority ||
750 td == pcpu->pc_idlethread) {
751 td->td_flags |= TDF_NEEDRESCHED;
752 ipi_selected(1 << cpu, IPI_AST);
753 }
754 }
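/*
 * ksq_assigned is a lock-free singly linked list threaded through
 * ke_assign: kseq_notify() pushes a kse onto it with the cmpset loop
 * above, and the target cpu later drains the whole list in kseq_assign(),
 * which re-adds each kse locally.  This lets one cpu hand work to another
 * without manipulating the remote run queues directly.
 */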
755
756 static struct kse *
757 runq_steal(struct runq *rq)
758 {
759 struct rqhead *rqh;
760 struct rqbits *rqb;
761 struct kse *ke;
762 int word;
763 int bit;
764
765 mtx_assert(&sched_lock, MA_OWNED);
766 rqb = &rq->rq_status;
767 for (word = 0; word < RQB_LEN; word++) {
768 if (rqb->rqb_bits[word] == 0)
769 continue;
770 for (bit = 0; bit < RQB_BPW; bit++) {
771 if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
772 continue;
773 rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
774 TAILQ_FOREACH(ke, rqh, ke_procq) {
775 if (KSE_CAN_MIGRATE(ke,
776 PRI_BASE(ke->ke_ksegrp->kg_pri_class)))
777 return (ke);
778 }
779 }
780 }
781 return (NULL);
782 }
783
784 static struct kse *
785 kseq_steal(struct kseq *kseq, int stealidle)
786 {
787 struct kse *ke;
788
789 /*
790 * Steal from next first to try to get a non-interactive task that
791 * may not have run for a while.
792 */
793 if ((ke = runq_steal(kseq->ksq_next)) != NULL)
794 return (ke);
795 if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
796 return (ke);
797 if (stealidle)
798 return (runq_steal(&kseq->ksq_idle));
799 return (NULL);
800 }
801
802 int
803 kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
804 {
805 struct kseq_group *ksg;
806 int cpu;
807
808 if (smp_started == 0)
809 return (0);
810 cpu = 0;
811 /*
812 * If our load exceeds a certain threshold we should attempt to
813 * reassign this thread. The first candidate is the cpu that
814 * originally ran the thread. If it is idle, assign it there,
815 * otherwise, pick an idle cpu.
816 *
817 * The threshold at which we start to reassign kses has a large impact
818 * on the overall performance of the system. Tuned too high and
819 * some CPUs may idle. Too low and there will be excess migration
820 * and context switches.
821 */
822 ksg = kseq->ksq_group;
823 if (ksg->ksg_load > ksg->ksg_cpus && kseq_idle) {
824 ksg = KSEQ_CPU(ke->ke_cpu)->ksq_group;
825 if (kseq_idle & ksg->ksg_mask) {
826 cpu = ffs(ksg->ksg_idlemask);
827 if (cpu)
828 goto migrate;
829 }
830 /*
831 * Multiple cpus could find this bit simultaneously
832 * but the race shouldn't be terrible.
833 */
834 cpu = ffs(kseq_idle);
835 if (cpu)
836 goto migrate;
837 }
838 /*
839 * If another cpu in this group has idled, assign a thread over
840 * to them after checking to see if there are idled groups.
841 */
842 ksg = kseq->ksq_group;
843 if (ksg->ksg_idlemask) {
844 cpu = ffs(ksg->ksg_idlemask);
845 if (cpu)
846 goto migrate;
847 }
848 /*
849 * No new CPU was found.
850 */
851 return (0);
852 migrate:
853 /*
854 * Now that we've found an idle CPU, migrate the thread.
855 */
856 cpu--;
857 ke->ke_runq = NULL;
858 kseq_notify(ke, cpu);
859
860 return (1);
861 }
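/*
 * To summarize the selection order above: when this group is loaded
 * beyond one kse per cpu, an idle cpu in the group that last ran the kse
 * is preferred, then any cpu whose entire group has idled; failing that,
 * an idle sibling in the current group is used.  If no candidate is
 * found the kse stays where it is and kseq_transfer() returns 0.
 */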
862
863 #endif /* SMP */
864
865 /*
866 * Pick the highest priority task we have and return it.
867 */
868
869 static struct kse *
870 kseq_choose(struct kseq *kseq)
871 {
872 struct kse *ke;
873 struct runq *swap;
874
875 mtx_assert(&sched_lock, MA_OWNED);
876 swap = NULL;
877
878 for (;;) {
879 ke = runq_choose(kseq->ksq_curr);
880 if (ke == NULL) {
881 /*
882 * We already swapped once and didn't get anywhere.
883 */
884 if (swap)
885 break;
886 swap = kseq->ksq_curr;
887 kseq->ksq_curr = kseq->ksq_next;
888 kseq->ksq_next = swap;
889 continue;
890 }
891 /*
892 * If we encounter a slice of 0 the kse is in a
893 * TIMESHARE kse group and its nice was too far out
894 * of the range that receives slices.
895 */
896 if (ke->ke_slice == 0) {
897 runq_remove(ke->ke_runq, ke);
898 sched_slice(ke);
899 ke->ke_runq = kseq->ksq_next;
900 runq_add(ke->ke_runq, ke, 0);
901 continue;
902 }
903 return (ke);
904 }
905
906 return (runq_choose(&kseq->ksq_idle));
907 }
908
909 static void
910 kseq_setup(struct kseq *kseq)
911 {
912 runq_init(&kseq->ksq_timeshare[0]);
913 runq_init(&kseq->ksq_timeshare[1]);
914 runq_init(&kseq->ksq_idle);
915 kseq->ksq_curr = &kseq->ksq_timeshare[0];
916 kseq->ksq_next = &kseq->ksq_timeshare[1];
917 kseq->ksq_load = 0;
918 kseq->ksq_load_timeshare = 0;
919 }
920
921 static void
922 sched_setup(void *dummy)
923 {
924 #ifdef SMP
925 int balance_groups;
926 int i;
927 #endif
928
929 slice_min = (hz/100); /* 10ms */
930 slice_max = (hz/7); /* ~140ms */
931
932 #ifdef SMP
933 balance_groups = 0;
934 /*
935 * Initialize the kseqs.
936 */
937 for (i = 0; i < MAXCPU; i++) {
938 struct kseq *ksq;
939
940 ksq = &kseq_cpu[i];
941 ksq->ksq_assigned = NULL;
942 kseq_setup(&kseq_cpu[i]);
943 }
944 if (smp_topology == NULL) {
945 struct kseq_group *ksg;
946 struct kseq *ksq;
947
948 for (i = 0; i < MAXCPU; i++) {
949 ksq = &kseq_cpu[i];
950 ksg = &kseq_groups[i];
951 /*
952 * Setup a kseq group with one member.
953 */
954 ksq->ksq_transferable = 0;
955 ksq->ksq_group = ksg;
956 ksg->ksg_cpus = 1;
957 ksg->ksg_idlemask = 0;
958 ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
959 ksg->ksg_load = 0;
960 ksg->ksg_transferable = 0;
961 LIST_INIT(&ksg->ksg_members);
962 LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
963 }
964 } else {
965 struct kseq_group *ksg;
966 struct cpu_group *cg;
967 int j;
968
969 for (i = 0; i < smp_topology->ct_count; i++) {
970 cg = &smp_topology->ct_group[i];
971 ksg = &kseq_groups[i];
972 /*
973 * Initialize the group.
974 */
975 ksg->ksg_idlemask = 0;
976 ksg->ksg_load = 0;
977 ksg->ksg_transferable = 0;
978 ksg->ksg_cpus = cg->cg_count;
979 ksg->ksg_cpumask = cg->cg_mask;
980 LIST_INIT(&ksg->ksg_members);
981 /*
982 * Find all of the group members and add them.
983 */
984 for (j = 0; j < MAXCPU; j++) {
985 if ((cg->cg_mask & (1 << j)) != 0) {
986 if (ksg->ksg_mask == 0)
987 ksg->ksg_mask = 1 << j;
988 kseq_cpu[j].ksq_transferable = 0;
989 kseq_cpu[j].ksq_group = ksg;
990 LIST_INSERT_HEAD(&ksg->ksg_members,
991 &kseq_cpu[j], ksq_siblings);
992 }
993 }
994 if (ksg->ksg_cpus > 1)
995 balance_groups = 1;
996 }
997 ksg_maxid = smp_topology->ct_count - 1;
998 }
999 /*
1000 * Stagger the group and global load balancer so they do not
1001 * interfere with each other.
1002 */
1003 bal_tick = ticks + hz;
1004 if (balance_groups)
1005 gbal_tick = ticks + (hz / 2);
1006 #else
1007 kseq_setup(KSEQ_SELF());
1008 #endif
1009 mtx_lock_spin(&sched_lock);
1010 kseq_load_add(KSEQ_SELF(), &kse0);
1011 mtx_unlock_spin(&sched_lock);
1012 }
1013
1014 /*
1015 * Scale the scheduling priority according to the "interactivity" of this
1016 * process.
1017 */
1018 static void
1019 sched_priority(struct ksegrp *kg)
1020 {
1021 int pri;
1022
1023 if (kg->kg_pri_class != PRI_TIMESHARE)
1024 return;
1025
1026 pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
1027 pri += SCHED_PRI_BASE;
1028 pri += kg->kg_proc->p_nice;
1029
1030 if (pri > PRI_MAX_TIMESHARE)
1031 pri = PRI_MAX_TIMESHARE;
1032 else if (pri < PRI_MIN_TIMESHARE)
1033 pri = PRI_MIN_TIMESHARE;
1034
1035 kg->kg_user_pri = pri;
1036
1037 return;
1038 }
1039
1040 /*
1041 * Calculate a time slice based on the properties of the kseg and the runq
1042 * that we're on. This is only for PRI_TIMESHARE ksegrps.
1043 */
1044 static void
1045 sched_slice(struct kse *ke)
1046 {
1047 struct kseq *kseq;
1048 struct ksegrp *kg;
1049
1050 kg = ke->ke_ksegrp;
1051 kseq = KSEQ_CPU(ke->ke_cpu);
1052
1053 /*
1054 * Rationale:
1055 * KSEs in interactive ksegs get a minimal slice so that we
1056 * quickly notice if it abuses its advantage.
1057 *
1058 * KSEs in non-interactive ksegs are assigned a slice that is
1059 * based on the ksegs nice value relative to the least nice kseg
1060 * on the run queue for this cpu.
1061 *
1062 * If the KSE is less nice than all others it gets the maximum
1063 * slice and other KSEs will adjust their slice relative to
1064 * this when they first expire.
1065 *
1066  *	There is a 20 point window that starts relative to the least
1067  *	nice kse on the run queue.  Slice size is determined by
1068  *	the kse's distance from the least nice ksegrp.
1069 *
1070 * If the kse is outside of the window it will get no slice
1071 * and will be reevaluated each time it is selected on the
1072 * run queue. The exception to this is nice 0 ksegs when
1073 * a nice -20 is running. They are always granted a minimum
1074 * slice.
1075 */
1076 if (!SCHED_INTERACTIVE(kg)) {
1077 int nice;
1078
1079 nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin);
1080 if (kseq->ksq_load_timeshare == 0 ||
1081 kg->kg_proc->p_nice < kseq->ksq_nicemin)
1082 ke->ke_slice = SCHED_SLICE_MAX;
1083 else if (nice <= SCHED_SLICE_NTHRESH)
1084 ke->ke_slice = SCHED_SLICE_NICE(nice);
1085 else if (kg->kg_proc->p_nice == 0)
1086 ke->ke_slice = SCHED_SLICE_MIN;
1087 else
1088 ke->ke_slice = 0;
1089 } else
1090 ke->ke_slice = SCHED_SLICE_INTERACTIVE;
1091
1092 CTR6(KTR_ULE,
1093 "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
1094 ke, ke->ke_slice, kg->kg_proc->p_nice, kseq->ksq_nicemin,
1095 kseq->ksq_load_timeshare, SCHED_INTERACTIVE(kg));
1096
1097 return;
1098 }
1099
1100 /*
1101 * This routine enforces a maximum limit on the amount of scheduling history
1102 * kept. It is called after either the slptime or runtime is adjusted.
1103 * This routine will not operate correctly when slp or run times have been
1104 * adjusted to more than double their maximum.
1105 */
1106 static void
1107 sched_interact_update(struct ksegrp *kg)
1108 {
1109 int sum;
1110
1111 sum = kg->kg_runtime + kg->kg_slptime;
1112 if (sum < SCHED_SLP_RUN_MAX)
1113 return;
1114 /*
1115 * If we have exceeded by more than 1/5th then the algorithm below
1116 * will not bring us back into range. Dividing by two here forces
1117  * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
1118 */
1119 if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1120 kg->kg_runtime /= 2;
1121 kg->kg_slptime /= 2;
1122 return;
1123 }
1124 kg->kg_runtime = (kg->kg_runtime / 5) * 4;
1125 kg->kg_slptime = (kg->kg_slptime / 5) * 4;
1126 }
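/*
 * For example, with hz = 1000 SCHED_SLP_RUN_MAX is 5000 << 10.  A ksegrp
 * whose kg_runtime + kg_slptime creeps just past that limit has both
 * components scaled back to 4/5 of their value; if a single large
 * adjustment (such as the runtime inherited in sched_exit_ksegrp())
 * pushes the sum past 6/5 of the limit, both are halved instead.
 */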
1127
1128 static void
1129 sched_interact_fork(struct ksegrp *kg)
1130 {
1131 int ratio;
1132 int sum;
1133
1134 sum = kg->kg_runtime + kg->kg_slptime;
1135 if (sum > SCHED_SLP_RUN_FORK) {
1136 ratio = sum / SCHED_SLP_RUN_FORK;
1137 kg->kg_runtime /= ratio;
1138 kg->kg_slptime /= ratio;
1139 }
1140 }
1141
1142 static int
1143 sched_interact_score(struct ksegrp *kg)
1144 {
1145 int div;
1146
1147 if (kg->kg_runtime > kg->kg_slptime) {
1148 div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
1149 return (SCHED_INTERACT_HALF +
1150 (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
1151 } if (kg->kg_slptime > kg->kg_runtime) {
1152 div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
1153 return (kg->kg_runtime / div);
1154 }
1155
1156 /*
1157  * This can happen if slptime and runtime are 0 or exactly equal.
1158 */
1159 return (0);
1160
1161 }
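/*
 * Worked examples of the score, using hypothetical <<10-scaled tick
 * totals and SCHED_INTERACT_HALF = 50:
 *
 *	kg_runtime = 100, kg_slptime = 400:
 *		div = 400 / 50 = 8, score = 100 / 8 = 12	(interactive)
 *	kg_runtime = 400, kg_slptime = 100:
 *		div = 400 / 50 = 8, score = 50 + (50 - 100 / 8) = 88
 *							(well past the threshold)
 */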
1162
1163 /*
1164 * Very early in the boot some setup of scheduler-specific
1165  * parts of proc0 and of some scheduler resources needs to be done.
1166 * Called from:
1167 * proc0_init()
1168 */
1169 void
1170 schedinit(void)
1171 {
1172 /*
1173 * Set up the scheduler specific parts of proc0.
1174 */
1175 proc0.p_sched = NULL; /* XXX */
1176 ksegrp0.kg_sched = &kg_sched0;
1177 thread0.td_sched = &kse0;
1178 kse0.ke_thread = &thread0;
1179 kse0.ke_oncpu = NOCPU; /* wrong.. can we use PCPU(cpuid) yet? */
1180 kse0.ke_state = KES_THREAD;
1181 kg_sched0.skg_concurrency = 1;
1182 kg_sched0.skg_avail_opennings = 0; /* we are already running */
1183 }
1184
1185 /*
1186 * This is only somewhat accurate since given many processes of the same
1187 * priority they will switch when their slices run out, which will be
1188 * at most SCHED_SLICE_MAX.
1189 */
1190 int
1191 sched_rr_interval(void)
1192 {
1193 return (SCHED_SLICE_MAX);
1194 }
1195
1196 static void
1197 sched_pctcpu_update(struct kse *ke)
1198 {
1199 /*
1200 * Adjust counters and watermark for pctcpu calc.
1201 */
1202 if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
1203 /*
1204 * Shift the tick count out so that the divide doesn't
1205 * round away our results.
1206 */
1207 ke->ke_ticks <<= 10;
1208 ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
1209 SCHED_CPU_TICKS;
1210 ke->ke_ticks >>= 10;
1211 } else
1212 ke->ke_ticks = 0;
1213 ke->ke_ltick = ticks;
1214 ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
1215 }
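/*
 * The net effect is that ke_ticks is rescaled to cover a window of
 * exactly SCHED_CPU_TICKS ticks ending now, preserving the fraction of
 * the old window the kse spent on cpu, and ke_ftick/ke_ltick are reset
 * so the window slides forward.  A kse that has not run for longer than
 * the window simply starts over at zero.
 */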
1216
1217 void
1218 sched_prio(struct thread *td, u_char prio)
1219 {
1220 struct kse *ke;
1221
1222 ke = td->td_kse;
1223 mtx_assert(&sched_lock, MA_OWNED);
1224 if (TD_ON_RUNQ(td)) {
1225 /*
1226 * If the priority has been elevated due to priority
1227 * propagation, we may have to move ourselves to a new
1228 * queue. We still call adjustrunqueue below in case kse
1229 * needs to fix things up.
1230 */
1231 if (prio < td->td_priority && ke &&
1232 (ke->ke_flags & KEF_ASSIGNED) == 0 &&
1233 ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
1234 runq_remove(ke->ke_runq, ke);
1235 ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
1236 runq_add(ke->ke_runq, ke, 0);
1237 }
1238 /*
1239 * Hold this kse on this cpu so that sched_prio() doesn't
1240 * cause excessive migration. We only want migration to
1241 * happen as the result of a wakeup.
1242 */
1243 ke->ke_flags |= KEF_HOLD;
1244 adjustrunqueue(td, prio);
1245 } else
1246 td->td_priority = prio;
1247 }
1248
1249 void
1250 sched_switch(struct thread *td, struct thread *newtd, int flags)
1251 {
1252 struct kse *ke;
1253
1254 mtx_assert(&sched_lock, MA_OWNED);
1255
1256 ke = td->td_kse;
1257
1258 td->td_lastcpu = td->td_oncpu;
1259 td->td_oncpu = NOCPU;
1260 td->td_flags &= ~TDF_NEEDRESCHED;
1261 td->td_pflags &= ~TDP_OWEPREEMPT;
1262
1263 /*
1264 * If the KSE has been assigned it may be in the process of switching
1265 * to the new cpu. This is the case in sched_bind().
1266 */
1267 if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
1268 if (td == PCPU_GET(idlethread)) {
1269 TD_SET_CAN_RUN(td);
1270 } else {
1271 /* We are ending our run so make our slot available again */
1272 SLOT_RELEASE(td->td_ksegrp);
1273 if (TD_IS_RUNNING(td)) {
1274 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
1275 /*
1276 * Don't allow the thread to migrate
1277 * from a preemption.
1278 */
1279 ke->ke_flags |= KEF_HOLD;
1280 setrunqueue(td, SRQ_OURSELF|SRQ_YIELDING);
1281 } else {
1282 if (ke->ke_runq) {
1283 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
1284 } else if ((td->td_flags & TDF_IDLETD) == 0)
1285 kdb_backtrace();
1286 /*
1287 * We will not be on the run queue.
1288 * So we must be sleeping or similar.
1289 * Don't use the slot if we will need it
1290 * for newtd.
1291 */
1292 if ((td->td_proc->p_flag & P_HADTHREADS) &&
1293 (newtd == NULL ||
1294 newtd->td_ksegrp != td->td_ksegrp))
1295 slot_fill(td->td_ksegrp);
1296 }
1297 }
1298 }
1299 if (newtd != NULL) {
1300 /*
1301 * If we bring in a thread,
1302 * then account for it as if it had been added to the
1303 * run queue and then chosen.
1304 */
1305 newtd->td_kse->ke_flags |= KEF_DIDRUN;
1306 SLOT_USE(newtd->td_ksegrp);
1307 TD_SET_RUNNING(newtd);
1308 kseq_load_add(KSEQ_SELF(), newtd->td_kse);
1309 } else
1310 newtd = choosethread();
1311 if (td != newtd)
1312 cpu_switch(td, newtd);
1313 sched_lock.mtx_lock = (uintptr_t)td;
1314
1315 td->td_oncpu = PCPU_GET(cpuid);
1316 }
1317
1318 void
1319 sched_nice(struct proc *p, int nice)
1320 {
1321 struct ksegrp *kg;
1322 struct kse *ke;
1323 struct thread *td;
1324 struct kseq *kseq;
1325
1326 PROC_LOCK_ASSERT(p, MA_OWNED);
1327 mtx_assert(&sched_lock, MA_OWNED);
1328 /*
1329 * We need to adjust the nice counts for running KSEs.
1330 */
1331 FOREACH_KSEGRP_IN_PROC(p, kg) {
1332 if (kg->kg_pri_class == PRI_TIMESHARE) {
1333 FOREACH_THREAD_IN_GROUP(kg, td) {
1334 ke = td->td_kse;
1335 if (ke->ke_runq == NULL)
1336 continue;
1337 kseq = KSEQ_CPU(ke->ke_cpu);
1338 kseq_nice_rem(kseq, p->p_nice);
1339 kseq_nice_add(kseq, nice);
1340 }
1341 }
1342 }
1343 p->p_nice = nice;
1344 FOREACH_KSEGRP_IN_PROC(p, kg) {
1345 sched_priority(kg);
1346 FOREACH_THREAD_IN_GROUP(kg, td)
1347 td->td_flags |= TDF_NEEDRESCHED;
1348 }
1349 }
1350
1351 void
1352 sched_sleep(struct thread *td)
1353 {
1354 mtx_assert(&sched_lock, MA_OWNED);
1355
1356 td->td_slptime = ticks;
1357 td->td_base_pri = td->td_priority;
1358
1359 CTR2(KTR_ULE, "sleep thread %p (tick: %d)",
1360 td, td->td_slptime);
1361 }
1362
1363 void
1364 sched_wakeup(struct thread *td)
1365 {
1366 mtx_assert(&sched_lock, MA_OWNED);
1367
1368 /*
1369 * Let the kseg know how long we slept for. This is because process
1370 * interactivity behavior is modeled in the kseg.
1371 */
1372 if (td->td_slptime) {
1373 struct ksegrp *kg;
1374 int hzticks;
1375
1376 kg = td->td_ksegrp;
1377 hzticks = (ticks - td->td_slptime) << 10;
1378 if (hzticks >= SCHED_SLP_RUN_MAX) {
1379 kg->kg_slptime = SCHED_SLP_RUN_MAX;
1380 kg->kg_runtime = 1;
1381 } else {
1382 kg->kg_slptime += hzticks;
1383 sched_interact_update(kg);
1384 }
1385 sched_priority(kg);
1386 sched_slice(td->td_kse);
1387 CTR2(KTR_ULE, "wakeup thread %p (%d ticks)", td, hzticks);
1388 td->td_slptime = 0;
1389 }
1390 setrunqueue(td, SRQ_BORING);
1391 }
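/*
 * The << 10 shift keeps hzticks in the same fixed point units that
 * sched_interact_update() works with.  For example, a thread that slept
 * for two seconds at an assumed hz of 1000 adds 2000 << 10 to
 * kg_slptime; a sleep long enough to exceed SCHED_SLP_RUN_MAX instead
 * resets the history to the most interactive state possible (maximum
 * sleep time, minimal run time).
 */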
1392
1393 /*
1394 * Penalize the parent for creating a new child and initialize the child's
1395 * priority.
1396 */
1397 void
1398 sched_fork(struct thread *td, struct thread *childtd)
1399 {
1400
1401 mtx_assert(&sched_lock, MA_OWNED);
1402
1403 sched_fork_ksegrp(td, childtd->td_ksegrp);
1404 sched_fork_thread(td, childtd);
1405 }
1406
1407 void
1408 sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
1409 {
1410 struct ksegrp *kg = td->td_ksegrp;
1411 mtx_assert(&sched_lock, MA_OWNED);
1412
1413 child->kg_slptime = kg->kg_slptime;
1414 child->kg_runtime = kg->kg_runtime;
1415 child->kg_user_pri = kg->kg_user_pri;
1416 sched_interact_fork(child);
1417 kg->kg_runtime += tickincr << 10;
1418 sched_interact_update(kg);
1419
1420 CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)",
1421 kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime,
1422 child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime);
1423 }
1424
1425 void
1426 sched_fork_thread(struct thread *td, struct thread *child)
1427 {
1428 struct kse *ke;
1429 struct kse *ke2;
1430
1431 sched_newthread(child);
1432 ke = td->td_kse;
1433 ke2 = child->td_kse;
1434 ke2->ke_slice = 1; /* Attempt to quickly learn interactivity. */
1435 ke2->ke_cpu = ke->ke_cpu;
1436 ke2->ke_runq = NULL;
1437
1438 	/* Grab our parent's cpu estimation information. */
1439 ke2->ke_ticks = ke->ke_ticks;
1440 ke2->ke_ltick = ke->ke_ltick;
1441 ke2->ke_ftick = ke->ke_ftick;
1442 }
1443
1444 void
1445 sched_class(struct ksegrp *kg, int class)
1446 {
1447 struct kseq *kseq;
1448 struct kse *ke;
1449 struct thread *td;
1450 int nclass;
1451 int oclass;
1452
1453 mtx_assert(&sched_lock, MA_OWNED);
1454 if (kg->kg_pri_class == class)
1455 return;
1456
1457 nclass = PRI_BASE(class);
1458 oclass = PRI_BASE(kg->kg_pri_class);
1459 FOREACH_THREAD_IN_GROUP(kg, td) {
1460 ke = td->td_kse;
1461 if (ke->ke_state != KES_ONRUNQ &&
1462 ke->ke_state != KES_THREAD)
1463 continue;
1464 kseq = KSEQ_CPU(ke->ke_cpu);
1465
1466 #ifdef SMP
1467 /*
1468 * On SMP if we're on the RUNQ we must adjust the transferable
1469 		 * count because we could be changing to or from an interrupt
1470 * class.
1471 */
1472 if (ke->ke_state == KES_ONRUNQ) {
1473 if (KSE_CAN_MIGRATE(ke, oclass)) {
1474 kseq->ksq_transferable--;
1475 kseq->ksq_group->ksg_transferable--;
1476 }
1477 if (KSE_CAN_MIGRATE(ke, nclass)) {
1478 kseq->ksq_transferable++;
1479 kseq->ksq_group->ksg_transferable++;
1480 }
1481 }
1482 #endif
1483 if (oclass == PRI_TIMESHARE) {
1484 kseq->ksq_load_timeshare--;
1485 kseq_nice_rem(kseq, kg->kg_proc->p_nice);
1486 }
1487 if (nclass == PRI_TIMESHARE) {
1488 kseq->ksq_load_timeshare++;
1489 kseq_nice_add(kseq, kg->kg_proc->p_nice);
1490 }
1491 }
1492
1493 kg->kg_pri_class = class;
1494 }
1495
1496 /*
1497 * Return some of the child's priority and interactivity to the parent.
1498 * Avoid using sched_exit_thread to avoid having to decide which
1499 * thread in the parent gets the honour since it isn't used.
1500 */
1501 void
1502 sched_exit(struct proc *p, struct thread *childtd)
1503 {
1504 mtx_assert(&sched_lock, MA_OWNED);
1505 sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd);
1506 kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
1507 }
1508
1509 void
1510 sched_exit_ksegrp(struct ksegrp *kg, struct thread *td)
1511 {
1512 /* kg->kg_slptime += td->td_ksegrp->kg_slptime; */
1513 kg->kg_runtime += td->td_ksegrp->kg_runtime;
1514 sched_interact_update(kg);
1515 }
1516
1517 void
1518 sched_exit_thread(struct thread *td, struct thread *childtd)
1519 {
1520 kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
1521 }
1522
1523 void
1524 sched_clock(struct thread *td)
1525 {
1526 struct kseq *kseq;
1527 struct ksegrp *kg;
1528 struct kse *ke;
1529
1530 mtx_assert(&sched_lock, MA_OWNED);
1531 kseq = KSEQ_SELF();
1532 #ifdef SMP
1533 if (ticks == bal_tick)
1534 sched_balance();
1535 if (ticks == gbal_tick)
1536 sched_balance_groups();
1537 /*
1538 * We could have been assigned a non real-time thread without an
1539 * IPI.
1540 */
1541 if (kseq->ksq_assigned)
1542 kseq_assign(kseq); /* Potentially sets NEEDRESCHED */
1543 #endif
1544 /*
1545 * sched_setup() apparently happens prior to stathz being set. We
1546 * need to resolve the timers earlier in the boot so we can avoid
1547 * calculating this here.
1548 */
1549 if (realstathz == 0) {
1550 realstathz = stathz ? stathz : hz;
1551 tickincr = hz / realstathz;
1552 /*
1553 * XXX This does not work for values of stathz that are much
1554 * larger than hz.
1555 */
1556 if (tickincr == 0)
1557 tickincr = 1;
1558 }
1559
1560 ke = td->td_kse;
1561 kg = ke->ke_ksegrp;
1562
1563 /* Adjust ticks for pctcpu */
1564 ke->ke_ticks++;
1565 ke->ke_ltick = ticks;
1566
1567 /* Go up to one second beyond our max and then trim back down */
1568 if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
1569 sched_pctcpu_update(ke);
1570
1571 if (td->td_flags & TDF_IDLETD)
1572 return;
1573
1574 CTR4(KTR_ULE, "Tick thread %p (slice: %d, slptime: %d, runtime: %d)",
1575 td, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
1576 /*
1577 * We only do slicing code for TIMESHARE ksegrps.
1578 */
1579 if (kg->kg_pri_class != PRI_TIMESHARE)
1580 return;
1581 /*
1582 	 * We used a tick; charge it to the ksegrp so that we can compute our
1583 * interactivity.
1584 */
1585 kg->kg_runtime += tickincr << 10;
1586 sched_interact_update(kg);
1587
1588 /*
1589 * We used up one time slice.
1590 */
1591 if (--ke->ke_slice > 0)
1592 return;
1593 /*
1594 * We're out of time, recompute priorities and requeue.
1595 */
1596 kseq_load_rem(kseq, ke);
1597 sched_priority(kg);
1598 sched_slice(ke);
1599 if (SCHED_CURR(kg, ke))
1600 ke->ke_runq = kseq->ksq_curr;
1601 else
1602 ke->ke_runq = kseq->ksq_next;
1603 kseq_load_add(kseq, ke);
1604 td->td_flags |= TDF_NEEDRESCHED;
1605 }
1606
1607 int
1608 sched_runnable(void)
1609 {
1610 struct kseq *kseq;
1611 int load;
1612
1613 load = 1;
1614
1615 kseq = KSEQ_SELF();
1616 #ifdef SMP
1617 if (kseq->ksq_assigned) {
1618 mtx_lock_spin(&sched_lock);
1619 kseq_assign(kseq);
1620 mtx_unlock_spin(&sched_lock);
1621 }
1622 #endif
1623 if ((curthread->td_flags & TDF_IDLETD) != 0) {
1624 if (kseq->ksq_load > 0)
1625 goto out;
1626 } else
1627 if (kseq->ksq_load - 1 > 0)
1628 goto out;
1629 load = 0;
1630 out:
1631 return (load);
1632 }
1633
1634 void
1635 sched_userret(struct thread *td)
1636 {
1637 struct ksegrp *kg;
1638
1639 kg = td->td_ksegrp;
1640
1641 if (td->td_priority != kg->kg_user_pri) {
1642 mtx_lock_spin(&sched_lock);
1643 td->td_priority = kg->kg_user_pri;
1644 mtx_unlock_spin(&sched_lock);
1645 }
1646 }
1647
1648 struct kse *
1649 sched_choose(void)
1650 {
1651 struct kseq *kseq;
1652 struct kse *ke;
1653
1654 mtx_assert(&sched_lock, MA_OWNED);
1655 kseq = KSEQ_SELF();
1656 #ifdef SMP
1657 restart:
1658 if (kseq->ksq_assigned)
1659 kseq_assign(kseq);
1660 #endif
1661 ke = kseq_choose(kseq);
1662 if (ke) {
1663 #ifdef SMP
1664 if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
1665 if (kseq_idled(kseq) == 0)
1666 goto restart;
1667 #endif
1668 kseq_runq_rem(kseq, ke);
1669 ke->ke_state = KES_THREAD;
1670
1671 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
1672 CTR4(KTR_ULE, "Run thread %p from %p (slice: %d, pri: %d)",
1673 ke->ke_thread, ke->ke_runq, ke->ke_slice,
1674 ke->ke_thread->td_priority);
1675 }
1676 return (ke);
1677 }
1678 #ifdef SMP
1679 if (kseq_idled(kseq) == 0)
1680 goto restart;
1681 #endif
1682 return (NULL);
1683 }
1684
1685 void
1686 sched_add(struct thread *td, int flags)
1687 {
1688
1689 /* let jeff work out how to map the flags better */
1690 /* I'm open to suggestions */
1691 if (flags & SRQ_YIELDING)
1692 /*
1693 * Preempting during switching can be bad JUJU
1694 * especially for KSE processes
1695 */
1696 sched_add_internal(td, 0);
1697 else
1698 sched_add_internal(td, 1);
1699 }
1700
1701 static void
1702 sched_add_internal(struct thread *td, int preemptive)
1703 {
1704 struct kseq *kseq;
1705 struct ksegrp *kg;
1706 struct kse *ke;
1707 #ifdef SMP
1708 int canmigrate;
1709 #endif
1710 int class;
1711
1712 mtx_assert(&sched_lock, MA_OWNED);
1713 ke = td->td_kse;
1714 kg = td->td_ksegrp;
1715 if (ke->ke_flags & KEF_ASSIGNED)
1716 return;
1717 kseq = KSEQ_SELF();
1718 KASSERT(ke->ke_state != KES_ONRUNQ,
1719 ("sched_add: kse %p (%s) already in run queue", ke,
1720 ke->ke_proc->p_comm));
1721 KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1722 ("sched_add: process swapped out"));
1723 KASSERT(ke->ke_runq == NULL,
1724 ("sched_add: KSE %p is still assigned to a run queue", ke));
1725
1726 class = PRI_BASE(kg->kg_pri_class);
1727 switch (class) {
1728 case PRI_ITHD:
1729 case PRI_REALTIME:
1730 ke->ke_runq = kseq->ksq_curr;
1731 ke->ke_slice = SCHED_SLICE_MAX;
1732 ke->ke_cpu = PCPU_GET(cpuid);
1733 break;
1734 case PRI_TIMESHARE:
1735 if (SCHED_CURR(kg, ke))
1736 ke->ke_runq = kseq->ksq_curr;
1737 else
1738 ke->ke_runq = kseq->ksq_next;
1739 break;
1740 case PRI_IDLE:
1741 /*
1742 		 * This is for priority propagation.
1743 */
1744 if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
1745 ke->ke_runq = kseq->ksq_curr;
1746 else
1747 ke->ke_runq = &kseq->ksq_idle;
1748 ke->ke_slice = SCHED_SLICE_MIN;
1749 break;
1750 default:
1751 panic("Unknown pri class.");
1752 break;
1753 }
1754 #ifdef SMP
1755 /*
1756 * Don't migrate running threads here. Force the long term balancer
1757 * to do it.
1758 */
1759 canmigrate = KSE_CAN_MIGRATE(ke, class);
1760 if (ke->ke_flags & KEF_HOLD) {
1761 ke->ke_flags &= ~KEF_HOLD;
1762 canmigrate = 0;
1763 }
1764 /*
1765 * If this thread is pinned or bound, notify the target cpu.
1766 */
1767 if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid) ) {
1768 ke->ke_runq = NULL;
1769 kseq_notify(ke, ke->ke_cpu);
1770 return;
1771 }
1772 /*
1773 * If we had been idle, clear our bit in the group and potentially
1774 * the global bitmap. If not, see if we should transfer this thread.
1775 */
1776 if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
1777 (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
1778 /*
1779 * Check to see if our group is unidling, and if so, remove it
1780 * from the global idle mask.
1781 */
1782 if (kseq->ksq_group->ksg_idlemask ==
1783 kseq->ksq_group->ksg_cpumask)
1784 atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
1785 /*
1786 * Now remove ourselves from the group specific idle mask.
1787 */
1788 kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
1789 } else if (kseq->ksq_load > 1 && canmigrate)
1790 if (kseq_transfer(kseq, ke, class))
1791 return;
1792 ke->ke_cpu = PCPU_GET(cpuid);
1793 #endif
1794 /*
1795 * XXX With preemption this is not necessary.
1796 */
1797 if (td->td_priority < curthread->td_priority &&
1798 ke->ke_runq == kseq->ksq_curr)
1799 curthread->td_flags |= TDF_NEEDRESCHED;
1800 if (preemptive && maybe_preempt(td))
1801 return;
1802 SLOT_USE(td->td_ksegrp);
1803 ke->ke_ksegrp->kg_runq_threads++;
1804 ke->ke_state = KES_ONRUNQ;
1805
1806 kseq_runq_add(kseq, ke);
1807 kseq_load_add(kseq, ke);
1808 }
1809
1810 void
1811 sched_rem(struct thread *td)
1812 {
1813 struct kseq *kseq;
1814 struct kse *ke;
1815
1816 ke = td->td_kse;
1817 /*
1818 * It is safe to just return here because sched_rem() is only ever
1819 * used in places where we're immediately going to add the
1820 * kse back on again. In that case it'll be added with the correct
1821 * thread and priority when the caller drops the sched_lock.
1822 */
1823 if (ke->ke_flags & KEF_ASSIGNED)
1824 return;
1825 mtx_assert(&sched_lock, MA_OWNED);
1826 KASSERT((ke->ke_state == KES_ONRUNQ),
1827 ("sched_rem: KSE not on run queue"));
1828
1829 ke->ke_state = KES_THREAD;
1830 SLOT_RELEASE(td->td_ksegrp);
1831 ke->ke_ksegrp->kg_runq_threads--;
1832 kseq = KSEQ_CPU(ke->ke_cpu);
1833 kseq_runq_rem(kseq, ke);
1834 kseq_load_rem(kseq, ke);
1835 }
1836
1837 fixpt_t
1838 sched_pctcpu(struct thread *td)
1839 {
1840 fixpt_t pctcpu;
1841 struct kse *ke;
1842
1843 pctcpu = 0;
1844 ke = td->td_kse;
1845 if (ke == NULL)
1846 return (0);
1847
1848 mtx_lock_spin(&sched_lock);
1849 if (ke->ke_ticks) {
1850 int rtick;
1851
1852 /*
1853 * Don't update more frequently than twice a second. Allowing
1854 * this causes the cpu usage to decay away too quickly due to
1855 * rounding errors.
1856 */
1857 if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
1858 ke->ke_ltick < (ticks - (hz / 2)))
1859 sched_pctcpu_update(ke);
1860 /* How many rtick per second ? */
1861 rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
1862 pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1863 }
1864
1865 ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1866 mtx_unlock_spin(&sched_lock);
1867
1868 return (pctcpu);
1869 }
1870
1871 void
1872 sched_bind(struct thread *td, int cpu)
1873 {
1874 struct kse *ke;
1875
1876 mtx_assert(&sched_lock, MA_OWNED);
1877 ke = td->td_kse;
1878 ke->ke_flags |= KEF_BOUND;
1879 #ifdef SMP
1880 if (PCPU_GET(cpuid) == cpu)
1881 return;
1882 /* sched_rem without the runq_remove */
1883 ke->ke_state = KES_THREAD;
1884 ke->ke_ksegrp->kg_runq_threads--;
1885 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
1886 kseq_notify(ke, cpu);
1887 /* When we return from mi_switch we'll be on the correct cpu. */
1888 mi_switch(SW_VOL, NULL);
1889 #endif
1890 }
1891
1892 void
1893 sched_unbind(struct thread *td)
1894 {
1895 mtx_assert(&sched_lock, MA_OWNED);
1896 td->td_kse->ke_flags &= ~KEF_BOUND;
1897 }
1898
1899 int
1900 sched_load(void)
1901 {
1902 #ifdef SMP
1903 int total;
1904 int i;
1905
1906 total = 0;
1907 for (i = 0; i <= ksg_maxid; i++)
1908 total += KSEQ_GROUP(i)->ksg_load;
1909 return (total);
1910 #else
1911 return (KSEQ_SELF()->ksq_sysload);
1912 #endif
1913 }
1914
1915 int
1916 sched_sizeof_ksegrp(void)
1917 {
1918 return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1919 }
1920
1921 int
1922 sched_sizeof_proc(void)
1923 {
1924 return (sizeof(struct proc));
1925 }
1926
1927 int
1928 sched_sizeof_thread(void)
1929 {
1930 return (sizeof(struct thread) + sizeof(struct td_sched));
1931 }
1932 #define KERN_SWITCH_INCLUDE 1
1933 #include "kern/kern_switch.c"