FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_synch.c
1 /* $NetBSD: kern_synch.c,v 1.148.2.1 2005/10/21 17:39:40 riz Exp $ */
2
3 /*-
4 * Copyright (c) 1999, 2000, 2004 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center.
10 * This code is derived from software contributed to The NetBSD Foundation
11 * by Charles M. Hannum.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 * must display the following acknowledgement:
23 * This product includes software developed by the NetBSD
24 * Foundation, Inc. and its contributors.
25 * 4. Neither the name of The NetBSD Foundation nor the names of its
26 * contributors may be used to endorse or promote products derived
27 * from this software without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
30 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
31 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
32 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
33 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39 * POSSIBILITY OF SUCH DAMAGE.
40 */
41
42 /*-
43 * Copyright (c) 1982, 1986, 1990, 1991, 1993
44 * The Regents of the University of California. All rights reserved.
45 * (c) UNIX System Laboratories, Inc.
46 * All or some portions of this file are derived from material licensed
47 * to the University of California by American Telephone and Telegraph
48 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
49 * the permission of UNIX System Laboratories, Inc.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions
53 * are met:
54 * 1. Redistributions of source code must retain the above copyright
55 * notice, this list of conditions and the following disclaimer.
56 * 2. Redistributions in binary form must reproduce the above copyright
57 * notice, this list of conditions and the following disclaimer in the
58 * documentation and/or other materials provided with the distribution.
59 * 3. Neither the name of the University nor the names of its contributors
60 * may be used to endorse or promote products derived from this software
61 * without specific prior written permission.
62 *
63 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
73 * SUCH DAMAGE.
74 *
75 * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
76 */
77
78 #include <sys/cdefs.h>
79 __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.148.2.1 2005/10/21 17:39:40 riz Exp $");
80
81 #include "opt_ddb.h"
82 #include "opt_ktrace.h"
83 #include "opt_kstack.h"
84 #include "opt_lockdebug.h"
85 #include "opt_multiprocessor.h"
86 #include "opt_perfctrs.h"
87
88 #include <sys/param.h>
89 #include <sys/systm.h>
90 #include <sys/callout.h>
91 #include <sys/proc.h>
92 #include <sys/kernel.h>
93 #include <sys/buf.h>
94 #if defined(PERFCTRS)
95 #include <sys/pmc.h>
96 #endif
97 #include <sys/signalvar.h>
98 #include <sys/resourcevar.h>
99 #include <sys/sched.h>
100 #include <sys/sa.h>
101 #include <sys/savar.h>
102
103 #include <uvm/uvm_extern.h>
104
105 #ifdef KTRACE
106 #include <sys/ktrace.h>
107 #endif
108
109 #include <machine/cpu.h>
110
111 int lbolt; /* once a second sleep address */
112 int rrticks; /* number of hardclock ticks per roundrobin() */
113
114 /*
115 * The global scheduler state.
116 */
117 struct prochd sched_qs[RUNQUE_NQS]; /* run queues */
118 __volatile u_int32_t sched_whichqs; /* bitmap of non-empty queues */
119 struct slpque sched_slpque[SLPQUE_TABLESIZE]; /* sleep queues */
120
121 struct simplelock sched_lock = SIMPLELOCK_INITIALIZER;
122
123 void schedcpu(void *);
124 void updatepri(struct lwp *);
125 void endtsleep(void *);
126
127 __inline void sa_awaken(struct lwp *);
128 __inline void awaken(struct lwp *);
129
130 struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
131
132
133
134 /*
135 * Force switch among equal priority processes every 100ms.
136 * Called from hardclock every hz/10 == rrticks hardclock ticks.
137 */
138 /* ARGSUSED */
139 void
140 roundrobin(struct cpu_info *ci)
141 {
142 struct schedstate_percpu *spc = &ci->ci_schedstate;
143
144 spc->spc_rrticks = rrticks;
145
146 if (curlwp != NULL) {
147 if (spc->spc_flags & SPCF_SEENRR) {
148 /*
149 * The process has already been through a roundrobin
150 * without switching and may be hogging the CPU.
151 * Indicate that the process should yield.
152 */
153 spc->spc_flags |= SPCF_SHOULDYIELD;
154 } else
155 spc->spc_flags |= SPCF_SEENRR;
156 }
157 need_resched(curcpu());
158 }
159
160 /*
161 * Constants for digital decay and forget:
162 * 90% of (p_estcpu) usage in 5 * loadav time
163 * 95% of (p_pctcpu) usage in 60 seconds (load insensitive)
164 * Note that, as ps(1) mentions, this can let percentages
165 * total over 100% (I've seen 137.9% for 3 processes).
166 *
167 * Note that hardclock updates p_estcpu and p_cpticks independently.
168 *
169 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
170 * That is, the system wants to compute a value of decay such
171 * that the following for loop:
172 * for (i = 0; i < (5 * loadavg); i++)
173 * p_estcpu *= decay;
174 * will compute
175 * p_estcpu *= 0.1;
176 * for all values of loadavg:
177 *
178 * Mathematically this loop can be expressed by saying:
179 * decay ** (5 * loadavg) ~= .1
180 *
181 * The system computes decay as:
182 * decay = (2 * loadavg) / (2 * loadavg + 1)
183 *
184 * We wish to prove that the system's computation of decay
185 * will always fulfill the equation:
186 * decay ** (5 * loadavg) ~= .1
187 *
188 * If we compute b as:
189 * b = 2 * loadavg
190 * then
191 * decay = b / (b + 1)
192 *
193 * We now need to prove two things:
194 * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
195 * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
196 *
197 * Facts:
198 * For x close to zero, exp(x) =~ 1 + x, since
199 * exp(x) = x**0/0! + x**1/1! + x**2/2! + ... ,
200 * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
201 * For x close to zero, ln(1+x) =~ x, since
202 * ln(1+x) = x - x**2/2 + x**3/3 - ... , for -1 < x < 1;
203 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
204 * ln(.1) =~ -2.30
205 *
206 * Proof of (1):
207 * Solve (factor)**(power) =~ .1 given power (5*loadav):
208 * solving for factor,
209 * ln(factor) =~ (-2.30/5*loadav), or
210 * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
211 * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
212 *
213 * Proof of (2):
214 * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
215 * solving for power,
216 * power*ln(b/(b+1)) =~ -2.30, or
217 * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
218 *
219 * Actual power values for the implemented algorithm are as follows:
220 * loadav: 1 2 3 4
221 * power: 5.68 10.32 14.94 19.55
222 */
223
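/*
 * Editor's note: an illustrative, userland-only sketch (not part of the
 * kernel) that checks the power table above -- for each loadav, solve
 * decay ** power == .1 for power and compare against 5.68 .. 19.55.
 */
#if 0
#include <math.h>
#include <stdio.h>

int
main(void)
{
	int loadav;

	for (loadav = 1; loadav <= 4; loadav++) {
		double b = 2.0 * loadav;	/* b = 2 * loadavg */
		double decay = b / (b + 1.0);	/* decay = b / (b + 1) */

		/* power such that decay ** power == .1 */
		printf("loadav %d: power %.2f\n",
		    loadav, log(0.1) / log(decay));
	}
	return 0;
}
#endif
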
224 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
225 #define loadfactor(loadav) (2 * (loadav))
226 #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE))
227
228 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
229 fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
230
231 /*
232 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
233 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
234 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
235 *
236 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
237 * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
238 *
239 * If you don't want to bother with the faster/more-accurate formula, you
240 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
241 * (more general) method of calculating the percentage of CPU used by a process.
242 */
243 #define CCPU_SHIFT 11
244
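/*
 * Editor's note (illustrative, derived from the constants above): with
 * clkhz == 100 and FSHIFT == CCPU_SHIFT, the fast path in schedcpu()
 * below adds p_cpticks fixed-point units per second while p_pctcpu
 * decays by ccpu, so a fully CPU-bound process converges to roughly
 * 100 / (1 - exp(-1/20)) =~ 2050 =~ FSCALE, i.e. 100%.
 */
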
245 /*
246 * Recompute process priorities, every hz ticks.
247 */
248 /* ARGSUSED */
249 void
250 schedcpu(void *arg)
251 {
252 fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
253 struct lwp *l;
254 struct proc *p;
255 int s, minslp;
256 unsigned int newcpu;
257 int clkhz;
258
259 proclist_lock_read();
260 PROCLIST_FOREACH(p, &allproc) {
261 /*
262 * Increment time in/out of memory and sleep time
263 * (if sleeping). We ignore overflow; with 16-bit int's
264 * (remember them?) overflow takes 45 days.
265 */
266 minslp = 2;
267 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
268 l->l_swtime++;
269 if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
270 l->l_stat == LSSUSPENDED) {
271 l->l_slptime++;
272 minslp = min(minslp, l->l_slptime);
273 } else
274 minslp = 0;
275 }
276 p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
277 /*
278 * If the process has slept the entire second,
279 * stop recalculating its priority until it wakes up.
280 */
281 if (minslp > 1)
282 continue;
283 s = splstatclock(); /* prevent state changes */
284 /*
285 * p_pctcpu is only for ps.
286 */
287 clkhz = stathz != 0 ? stathz : hz;
288 #if (FSHIFT >= CCPU_SHIFT)
289 p->p_pctcpu += (clkhz == 100)?
290 ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
291 100 * (((fixpt_t) p->p_cpticks)
292 << (FSHIFT - CCPU_SHIFT)) / clkhz;
293 #else
294 p->p_pctcpu += ((FSCALE - ccpu) *
295 (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
296 #endif
297 p->p_cpticks = 0;
298 newcpu = (u_int)decay_cpu(loadfac, p->p_estcpu);
299 p->p_estcpu = newcpu;
300 splx(s); /* Done with the process CPU ticks update */
301 SCHED_LOCK(s);
302 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
303 if (l->l_slptime > 1)
304 continue;
305 resetpriority(l);
306 if (l->l_priority >= PUSER) {
307 if (l->l_stat == LSRUN &&
308 (l->l_flag & L_INMEM) &&
309 (l->l_priority / PPQ) != (l->l_usrpri / PPQ)) {
310 remrunqueue(l);
311 l->l_priority = l->l_usrpri;
312 setrunqueue(l);
313 } else
314 l->l_priority = l->l_usrpri;
315 }
316 }
317 SCHED_UNLOCK(s);
318 }
319 proclist_unlock_read();
320 uvm_meter();
321 wakeup((caddr_t)&lbolt);
322 callout_schedule(&schedcpu_ch, hz);
323 }
324
325 /*
326 * Recalculate the priority of a process after it has slept for a while.
327 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
328 * least six times the loadfactor will decay p_estcpu to zero.
329 */
330 void
331 updatepri(struct lwp *l)
332 {
333 struct proc *p = l->l_proc;
334 unsigned int newcpu;
335 fixpt_t loadfac;
336
337 SCHED_ASSERT_LOCKED();
338
339 newcpu = p->p_estcpu;
340 loadfac = loadfactor(averunnable.ldavg[0]);
341
342 if (l->l_slptime > 5 * loadfac)
343 p->p_estcpu = 0; /* XXX NJWLWP */
344 else {
345 l->l_slptime--; /* the first time was done in schedcpu */
346 while (newcpu && --l->l_slptime)
347 newcpu = (int) decay_cpu(loadfac, newcpu);
348 p->p_estcpu = newcpu;
349 }
350 resetpriority(l);
351 }
352
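/*
 * Editor's note (illustrative): with loadav == 1, loadfac == 2 * FSCALE,
 * so decay_cpu() multiplies by 2/3 -- each second spent asleep scales
 * p_estcpu to roughly two-thirds of its value, with integer truncation
 * hurrying the tail of the decay toward zero.
 */
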
353 /*
354 * During autoconfiguration or after a panic, a sleep will simply
355 * lower the priority briefly to allow interrupts, then return.
356 * The priority to be used (safepri) is machine-dependent, thus this
357 * value is initialized and maintained in the machine-dependent layers.
358 * This priority will typically be 0, or the lowest priority
359 * that is safe for use on the interrupt stack; it can be made
360 * higher to block network software interrupts after panics.
361 */
362 int safepri;
363
364 /*
365 * General sleep call. Suspends the current process until a wakeup is
366 * performed on the specified identifier. The process will then be made
367 * runnable with the specified priority. Sleeps at most timo/hz seconds
368 * (0 means no timeout). If pri includes PCATCH flag, signals are checked
369 * before and after sleeping, else signals are not checked. Returns 0 if
370 * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
371 * signal needs to be delivered, ERESTART is returned if the current system
372 * call should be restarted if possible, and EINTR is returned if the system
373 * call should be interrupted by the signal.
374 *
375 * The interlock is held until the scheduler lock (sched_lock) is acquired. The
376 * interlock will be locked before returning back to the caller
377 * unless the PNORELOCK flag is specified, in which case the
378 * interlock will always be unlocked upon return.
379 */
380 int
381 ltsleep(const void *ident, int priority, const char *wmesg, int timo,
382 __volatile struct simplelock *interlock)
383 {
384 struct lwp *l = curlwp;
385 struct proc *p = l ? l->l_proc : NULL;
386 struct slpque *qp;
387 struct sadata_upcall *sau;
388 int sig, s;
389 int catch = priority & PCATCH;
390 int relock = (priority & PNORELOCK) == 0;
391 int exiterr = (priority & PNOEXITERR) == 0;
392
393 /*
394 * XXXSMP
395 * This is probably bogus. Figure out what the right
396 * thing to do here really is.
397 * Note that not sleeping if ltsleep is called with curlwp == NULL
398 * in the shutdown case is disgusting but partly necessary given
399 * how shutdown (barely) works.
400 */
401 if (cold || (doing_shutdown && (panicstr || (l == NULL)))) {
402 /*
403 * After a panic, or during autoconfiguration,
404 * just give interrupts a chance, then just return;
405 * don't run any other procs or panic below,
406 * in case this is the idle process and already asleep.
407 */
408 s = splhigh();
409 splx(safepri);
410 splx(s);
411 if (interlock != NULL && relock == 0)
412 simple_unlock(interlock);
413 return (0);
414 }
415
416 KASSERT(p != NULL);
417 LOCK_ASSERT(interlock == NULL || simple_lock_held(interlock));
418
419 #ifdef KTRACE
420 if (KTRPOINT(p, KTR_CSW))
421 ktrcsw(p, 1, 0);
422 #endif
423
424 /*
425 * XXX We need to allocate the sadata_upcall structure here,
426 * XXX since we can't sleep while waiting for memory inside
427 * XXX sa_upcall(). It would be nice if we could safely
428 * XXX allocate the sadata_upcall structure on the stack, here.
429 */
430 if (l->l_flag & L_SA) {
431 sau = sadata_upcall_alloc(0);
432 } else {
433 sau = NULL;
434 }
435
436 SCHED_LOCK(s);
437
438 #ifdef DIAGNOSTIC
439 if (ident == NULL)
440 panic("ltsleep: ident == NULL");
441 if (l->l_stat != LSONPROC)
442 panic("ltsleep: l_stat %d != LSONPROC", l->l_stat);
443 if (l->l_back != NULL)
444 panic("ltsleep: p_back != NULL");
445 #endif
446
447 l->l_wchan = ident;
448 l->l_wmesg = wmesg;
449 l->l_slptime = 0;
450 l->l_priority = priority & PRIMASK;
451
452 qp = SLPQUE(ident);
453 if (qp->sq_head == 0)
454 qp->sq_head = l;
455 else {
456 *qp->sq_tailp = l;
457 }
458 *(qp->sq_tailp = &l->l_forw) = 0;
459
460 if (timo)
461 callout_reset(&l->l_tsleep_ch, timo, endtsleep, l);
462
463 /*
464 * We can now release the interlock; the scheduler lock
465 * is held, so a thread can't get in to do wakeup() before
466 * we do the switch.
467 *
468 * XXX We leave the code block here, after inserting ourselves
469 * on the sleep queue, because we might want a more clever
470 * data structure for the sleep queues at some point.
471 */
472 if (interlock != NULL)
473 simple_unlock(interlock);
474
475 /*
476 * We put ourselves on the sleep queue and start our timeout
477 * before calling CURSIG, as we could stop there, and a wakeup
478 * or a SIGCONT (or both) could occur while we were stopped.
479 * A SIGCONT would cause us to be marked as LSSLEEP
480 * without resuming us, thus we must be ready for sleep
481 * when CURSIG is called. If the wakeup happens while we're
482 * stopped, l->l_wchan will be 0 upon return from CURSIG.
483 */
484 if (catch) {
485 l->l_flag |= L_SINTR;
486 if (((sig = CURSIG(l)) != 0) ||
487 ((p->p_flag & P_WEXIT) && p->p_nlwps > 1)) {
488 if (l->l_wchan != NULL)
489 unsleep(l);
490 l->l_stat = LSONPROC;
491 SCHED_UNLOCK(s);
492 goto resume;
493 }
494 if (l->l_wchan == NULL) {
495 catch = 0;
496 SCHED_UNLOCK(s);
497 goto resume;
498 }
499 } else
500 sig = 0;
501 l->l_stat = LSSLEEP;
502 p->p_nrlwps--;
503 p->p_stats->p_ru.ru_nvcsw++;
504 SCHED_ASSERT_LOCKED();
505 if (l->l_flag & L_SA)
506 sa_switch(l, sau, SA_UPCALL_BLOCKED);
507 else
508 mi_switch(l, NULL);
509
510 #if defined(DDB) && !defined(GPROF)
511 /* handy breakpoint location after process "wakes" */
512 __asm(".globl bpendtsleep\nbpendtsleep:");
513 #endif
514 /*
515 * p->p_nrlwps is incremented by whoever made us runnable again,
516 * either setrunnable() or awaken().
517 */
518
519 SCHED_ASSERT_UNLOCKED();
520 splx(s);
521
522 resume:
523 KDASSERT(l->l_cpu != NULL);
524 KDASSERT(l->l_cpu == curcpu());
525 l->l_cpu->ci_schedstate.spc_curpriority = l->l_usrpri;
526
527 l->l_flag &= ~L_SINTR;
528 if (l->l_flag & L_TIMEOUT) {
529 l->l_flag &= ~(L_TIMEOUT|L_CANCELLED);
530 if (sig == 0) {
531 #ifdef KTRACE
532 if (KTRPOINT(p, KTR_CSW))
533 ktrcsw(p, 0, 0);
534 #endif
535 if (relock && interlock != NULL)
536 simple_lock(interlock);
537 return (EWOULDBLOCK);
538 }
539 } else if (timo)
540 callout_stop(&l->l_tsleep_ch);
541
542 if (catch) {
543 const int cancelled = l->l_flag & L_CANCELLED;
544 l->l_flag &= ~L_CANCELLED;
545 if (sig != 0 || (sig = CURSIG(l)) != 0 || cancelled) {
546 #ifdef KTRACE
547 if (KTRPOINT(p, KTR_CSW))
548 ktrcsw(p, 0, 0);
549 #endif
550 if (relock && interlock != NULL)
551 simple_lock(interlock);
552 /*
553 * If this sleep was canceled, don't let the syscall
554 * restart.
555 */
556 if (cancelled ||
557 (SIGACTION(p, sig).sa_flags & SA_RESTART) == 0)
558 return (EINTR);
559 return (ERESTART);
560 }
561 }
562
563 #ifdef KTRACE
564 if (KTRPOINT(p, KTR_CSW))
565 ktrcsw(p, 0, 0);
566 #endif
567 if (relock && interlock != NULL)
568 simple_lock(interlock);
569
570 /*
571 * XXXNJW: this is very much a kluge; revisit. A better way of
572 * preventing looping/hanging syscalls like wait4() and _lwp_wait()
573 * from wedging an exiting process would be preferred.
574 */
575 if (catch && ((p->p_flag & P_WEXIT) && p->p_nlwps > 1 && exiterr))
576 return (EINTR);
577 return (0);
578 }
579
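/*
 * Editor's sketch: an illustrative ltsleep()/wakeup() pairing around a
 * hypothetical driver softc -- sc, sc_slock and sc_busy are made-up
 * names, not from this file. The condition is re-tested in a loop
 * because the interlock is dropped while asleep and (unless PNORELOCK
 * is given) reacquired before ltsleep() returns.
 */
#if 0
	int error;

	simple_lock(&sc->sc_slock);
	while (sc->sc_busy) {
		error = ltsleep(&sc->sc_busy, PRIBIO | PCATCH, "scbusy",
		    0, &sc->sc_slock);
		if (error) {
			/* EINTR or ERESTART; the interlock is still held. */
			simple_unlock(&sc->sc_slock);
			return (error);
		}
	}
	sc->sc_busy = 1;
	simple_unlock(&sc->sc_slock);

	/* ... and on the wakeup side: */
	simple_lock(&sc->sc_slock);
	sc->sc_busy = 0;
	simple_unlock(&sc->sc_slock);
	wakeup(&sc->sc_busy);
#endif
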
580 /*
581 * Implement timeout for tsleep.
582 * If the LWP hasn't been awakened (wchan non-zero),
583 * set timeout flag and undo the sleep. If the LWP
584 * is stopped, just unsleep so it will remain stopped.
585 */
586 void
587 endtsleep(void *arg)
588 {
589 struct lwp *l;
590 int s;
591
592 l = (struct lwp *)arg;
593 SCHED_LOCK(s);
594 if (l->l_wchan) {
595 if (l->l_stat == LSSLEEP)
596 setrunnable(l);
597 else
598 unsleep(l);
599 l->l_flag |= L_TIMEOUT;
600 }
601 SCHED_UNLOCK(s);
602 }
603
604 /*
605 * Remove an LWP from its sleep queue.
606 */
607 void
608 unsleep(struct lwp *l)
609 {
610 struct slpque *qp;
611 struct lwp **hp;
612
613 SCHED_ASSERT_LOCKED();
614
615 if (l->l_wchan) {
616 hp = &(qp = SLPQUE(l->l_wchan))->sq_head;
617 while (*hp != l)
618 hp = &(*hp)->l_forw;
619 *hp = l->l_forw;
620 if (qp->sq_tailp == &l->l_forw)
621 qp->sq_tailp = hp;
622 l->l_wchan = 0;
623 }
624 }
625
626 __inline void
627 sa_awaken(struct lwp *l)
628 {
629
630 SCHED_ASSERT_LOCKED();
631
632 if (l == l->l_savp->savp_lwp && (l->l_flag & L_SA_YIELD))
633 l->l_flag &= ~L_SA_IDLE;
634 }
635
636 /*
637 * Optimized-for-wakeup() version of setrunnable().
638 */
639 __inline void
640 awaken(struct lwp *l)
641 {
642
643 SCHED_ASSERT_LOCKED();
644
645 if (l->l_proc->p_sa)
646 sa_awaken(l);
647
648 if (l->l_slptime > 1)
649 updatepri(l);
650 l->l_slptime = 0;
651 l->l_stat = LSRUN;
652 l->l_proc->p_nrlwps++;
653 /*
654 * Since curpriority is a user priority, l->l_priority
655 * is always better than curpriority on the last CPU on
656 * which it ran.
657 *
658 * XXXSMP See affinity comment in resched_proc().
659 */
660 if (l->l_flag & L_INMEM) {
661 setrunqueue(l);
662 KASSERT(l->l_cpu != NULL);
663 need_resched(l->l_cpu);
664 } else
665 sched_wakeup(&proc0);
666 }
667
668 #if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
669 void
670 sched_unlock_idle(void)
671 {
672
673 simple_unlock(&sched_lock);
674 }
675
676 void
677 sched_lock_idle(void)
678 {
679
680 simple_lock(&sched_lock);
681 }
682 #endif /* MULTIPROCESSOR || LOCKDEBUG */
683
684 /*
685 * Make all processes sleeping on the specified identifier runnable.
686 */
687
688 void
689 wakeup(const void *ident)
690 {
691 int s;
692
693 SCHED_ASSERT_UNLOCKED();
694
695 SCHED_LOCK(s);
696 sched_wakeup(ident);
697 SCHED_UNLOCK(s);
698 }
699
700 void
701 sched_wakeup(const void *ident)
702 {
703 struct slpque *qp;
704 struct lwp *l, **q;
705
706 SCHED_ASSERT_LOCKED();
707
708 qp = SLPQUE(ident);
709 restart:
710 for (q = &qp->sq_head; (l = *q) != NULL; ) {
711 #ifdef DIAGNOSTIC
712 if (l->l_back || (l->l_stat != LSSLEEP &&
713 l->l_stat != LSSTOP && l->l_stat != LSSUSPENDED))
714 panic("wakeup");
715 #endif
716 if (l->l_wchan == ident) {
717 l->l_wchan = 0;
718 *q = l->l_forw;
719 if (qp->sq_tailp == &l->l_forw)
720 qp->sq_tailp = q;
721 if (l->l_stat == LSSLEEP) {
722 awaken(l);
723 goto restart;
724 }
725 } else
726 q = &l->l_forw;
727 }
728 }
729
730 /*
731 * Make runnable the highest-priority process among those sleeping
732 * on the specified identifier.
733 */
734 void
735 wakeup_one(const void *ident)
736 {
737 struct slpque *qp;
738 struct lwp *l, **q;
739 struct lwp *best_sleepp, **best_sleepq;
740 struct lwp *best_stopp, **best_stopq;
741 int s;
742
743 best_sleepp = best_stopp = NULL;
744 best_sleepq = best_stopq = NULL;
745
746 SCHED_LOCK(s);
747
748 qp = SLPQUE(ident);
749
750 for (q = &qp->sq_head; (l = *q) != NULL; q = &l->l_forw) {
751 #ifdef DIAGNOSTIC
752 if (l->l_back || (l->l_stat != LSSLEEP &&
753 l->l_stat != LSSTOP && l->l_stat != LSSUSPENDED))
754 panic("wakeup_one");
755 #endif
756 if (l->l_wchan == ident) {
757 if (l->l_stat == LSSLEEP) {
758 if (best_sleepp == NULL ||
759 l->l_priority < best_sleepp->l_priority) {
760 best_sleepp = l;
761 best_sleepq = q;
762 }
763 } else {
764 if (best_stopp == NULL ||
765 l->l_priority < best_stopp->l_priority) {
766 best_stopp = l;
767 best_stopq = q;
768 }
769 }
770 }
771 }
772
773 /*
774 * Consider any LSSLEEP process higher priority than the highest-priority
775 * LSSTOP process.
776 */
777 if (best_sleepp != NULL) {
778 l = best_sleepp;
779 q = best_sleepq;
780 } else {
781 l = best_stopp;
782 q = best_stopq;
783 }
784
785 if (l != NULL) {
786 l->l_wchan = NULL;
787 *q = l->l_forw;
788 if (qp->sq_tailp == &l->l_forw)
789 qp->sq_tailp = q;
790 if (l->l_stat == LSSLEEP)
791 awaken(l);
792 }
793 SCHED_UNLOCK(s);
794 }
795
796 /*
797 * General yield call. Puts the current process back on its run queue and
798 * performs a voluntary context switch. Should only be called when the
799 * current process explicitly requests it (e.g. sched_yield(2) in compat code).
800 */
801 void
802 yield(void)
803 {
804 struct lwp *l = curlwp;
805 int s;
806
807 SCHED_LOCK(s);
808 l->l_priority = l->l_usrpri;
809 l->l_stat = LSRUN;
810 setrunqueue(l);
811 l->l_proc->p_stats->p_ru.ru_nvcsw++;
812 mi_switch(l, NULL);
813 SCHED_ASSERT_UNLOCKED();
814 splx(s);
815 }
816
817 /*
818 * General preemption call. Puts the current process back on its run queue
819 * and performs an involuntary context switch. The "more" argument is a
820 * scheduler-activations hint: when it is zero and another LWP actually
821 * ran, sa_preempt() notifies the SA process of the preemption.
822 */
823
824 void
825 preempt(int more)
826 {
827 struct lwp *l = curlwp;
828 int r, s;
829
830 SCHED_LOCK(s);
831 l->l_priority = l->l_usrpri;
832 l->l_stat = LSRUN;
833 setrunqueue(l);
834 l->l_proc->p_stats->p_ru.ru_nivcsw++;
835 r = mi_switch(l, NULL);
836 SCHED_ASSERT_UNLOCKED();
837 splx(s);
838 if ((l->l_flag & L_SA) != 0 && r != 0 && more == 0)
839 sa_preempt(l);
840 }
841
842 /*
843 * The machine independent parts of context switch.
844 * Must be called at splsched() (no higher!) and with
845 * the sched_lock held.
846 * Switch to "new" if non-NULL, otherwise let cpu_switch choose
847 * the next lwp.
848 *
849 * Returns 1 if another process was actually run.
850 */
851 int
852 mi_switch(struct lwp *l, struct lwp *newl)
853 {
854 struct schedstate_percpu *spc;
855 struct rlimit *rlim;
856 long s, u;
857 struct timeval tv;
858 int hold_count;
859 struct proc *p = l->l_proc;
860 int retval;
861
862 SCHED_ASSERT_LOCKED();
863
864 /*
865 * Release the kernel_lock, as we are about to yield the CPU.
866 * The scheduler lock is still held until cpu_switch()
867 * selects a new process and removes it from the run queue.
868 */
869 hold_count = KERNEL_LOCK_RELEASE_ALL();
870
871 KDASSERT(l->l_cpu != NULL);
872 KDASSERT(l->l_cpu == curcpu());
873
874 spc = &l->l_cpu->ci_schedstate;
875
876 #if defined(LOCKDEBUG) || defined(DIAGNOSTIC)
877 spinlock_switchcheck();
878 #endif
879 #ifdef LOCKDEBUG
880 simple_lock_switchcheck();
881 #endif
882
883 /*
884 * Compute the amount of time during which the current
885 * process was running.
886 */
887 microtime(&tv);
888 u = p->p_rtime.tv_usec +
889 (tv.tv_usec - spc->spc_runtime.tv_usec);
890 s = p->p_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
891 if (u < 0) {
892 u += 1000000;
893 s--;
894 } else if (u >= 1000000) {
895 u -= 1000000;
896 s++;
897 }
898 p->p_rtime.tv_usec = u;
899 p->p_rtime.tv_sec = s;
900
901 /*
902 * Check if the process exceeds its CPU resource allocation.
903 * If over max, kill it. In any case, if it has run longer than
904 * autonicetime seconds, renice it to give others a chance.
905 */
906 rlim = &p->p_rlimit[RLIMIT_CPU];
907 if (s >= rlim->rlim_cur) {
908 /*
909 * XXXSMP: we're inside the scheduler lock perimeter;
910 * use sched_psignal.
911 */
912 if (s >= rlim->rlim_max)
913 sched_psignal(p, SIGKILL);
914 else {
915 sched_psignal(p, SIGXCPU);
916 if (rlim->rlim_cur < rlim->rlim_max)
917 rlim->rlim_cur += 5;
918 }
919 }
920 if (autonicetime && s > autonicetime && p->p_ucred->cr_uid &&
921 p->p_nice == NZERO) {
922 p->p_nice = autoniceval + NZERO;
923 resetpriority(l);
924 }
925
926 /*
927 * Process is about to yield the CPU; clear the appropriate
928 * scheduling flags.
929 */
930 spc->spc_flags &= ~SPCF_SWITCHCLEAR;
931
932 #ifdef KSTACK_CHECK_MAGIC
933 kstack_check_magic(l);
934 #endif
935
936 /*
937 * If we are using h/w performance counters, save context.
938 */
939 #if defined(PERFCTRS)
940 if (PMC_ENABLED(p))
941 pmc_save_context(p);
942 #endif
943
944 /*
945 * Switch to the new current process. When we
946 * run again, we'll return back here.
947 */
948 uvmexp.swtch++;
949 if (newl == NULL) {
950 retval = cpu_switch(l, NULL);
951 } else {
952 remrunqueue(newl);
953 cpu_switchto(l, newl);
954 retval = 0;
955 }
956
957 /*
958 * If we are using h/w performance counters, restore context.
959 */
960 #if defined(PERFCTRS)
961 if (PMC_ENABLED(p))
962 pmc_restore_context(p);
963 #endif
964
965 /*
966 * Make sure that MD code released the scheduler lock before
967 * resuming us.
968 */
969 SCHED_ASSERT_UNLOCKED();
970
971 /*
972 * We're running again; record our new start time. We might
973 * be running on a new CPU now, so don't use the cached
974 * schedstate_percpu pointer.
975 */
976 KDASSERT(l->l_cpu != NULL);
977 KDASSERT(l->l_cpu == curcpu());
978 microtime(&l->l_cpu->ci_schedstate.spc_runtime);
979
980 /*
981 * Reacquire the kernel_lock now. We do this after we've
982 * released the scheduler lock to avoid deadlock, and before
983 * we reacquire the interlock.
984 */
985 KERNEL_LOCK_ACQUIRE_COUNT(hold_count);
986
987 return retval;
988 }
989
990 /*
991 * Initialize the (doubly-linked) run queues
992 * to be empty.
993 */
994 void
995 rqinit(void)
996 {
997 int i;
998
999 for (i = 0; i < RUNQUE_NQS; i++)
1000 sched_qs[i].ph_link = sched_qs[i].ph_rlink =
1001 (struct lwp *)&sched_qs[i];
1002 }
1003
1004 static __inline void
1005 resched_proc(struct lwp *l, u_char pri)
1006 {
1007 struct cpu_info *ci;
1008
1009 /*
1010 * XXXSMP
1011 * Since l->l_cpu persists across a context switch,
1012 * this gives us *very weak* processor affinity, in
1013 * that we notify the CPU on which the process last
1014 * ran that it should try to switch.
1015 *
1016 * This does not guarantee that the process will run on
1017 * that processor next, because another processor might
1018 * grab it the next time it performs a context switch.
1019 *
1020 * This also does not handle the case where its last
1021 * CPU is running a higher-priority process, but every
1022 * other CPU is running a lower-priority process. There
1023 * are ways to handle this situation, but they're not
1024 * currently very pretty, and we also need to weigh the
1025 * cost of moving a process from one CPU to another.
1026 *
1027 * XXXSMP
1028 * There is also the issue of locking the other CPU's
1029 * sched state, which we currently do not do.
1030 */
1031 ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
1032 if (pri < ci->ci_schedstate.spc_curpriority)
1033 need_resched(ci);
1034 }
1035
1036 /*
1037 * Change process state to be runnable,
1038 * placing it on the run queue if it is in memory,
1039 * and awakening the swapper if it isn't in memory.
1040 */
1041 void
1042 setrunnable(struct lwp *l)
1043 {
1044 struct proc *p = l->l_proc;
1045
1046 SCHED_ASSERT_LOCKED();
1047
1048 switch (l->l_stat) {
1049 case 0:
1050 case LSRUN:
1051 case LSONPROC:
1052 case LSZOMB:
1053 case LSDEAD:
1054 default:
1055 panic("setrunnable: lwp %p state was %d", l, l->l_stat);
1056 case LSSTOP:
1057 /*
1058 * If we're being traced (possibly because someone attached us
1059 * while we were stopped), check for a signal from the debugger.
1060 */
1061 if ((p->p_flag & P_TRACED) != 0 && p->p_xstat != 0) {
1062 sigaddset(&p->p_sigctx.ps_siglist, p->p_xstat);
1063 CHECKSIGS(p);
1064 }
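/* FALLTHROUGH */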
1065 case LSSLEEP:
1066 unsleep(l); /* e.g. when sending signals */
1067 break;
1068
1069 case LSIDL:
1070 break;
1071 case LSSUSPENDED:
1072 break;
1073 }
1074
1075 if (l->l_proc->p_sa)
1076 sa_awaken(l);
1077
1078 l->l_stat = LSRUN;
1079 p->p_nrlwps++;
1080
1081 if (l->l_flag & L_INMEM)
1082 setrunqueue(l);
1083
1084 if (l->l_slptime > 1)
1085 updatepri(l);
1086 l->l_slptime = 0;
1087 if ((l->l_flag & L_INMEM) == 0)
1088 sched_wakeup((caddr_t)&proc0);
1089 else
1090 resched_proc(l, l->l_priority);
1091 }
1092
1093 /*
1094 * Compute the priority of a process when running in user mode.
1095 * Arrange to reschedule if the resulting priority is better
1096 * than that of the current process.
1097 */
1098 void
1099 resetpriority(struct lwp *l)
1100 {
1101 unsigned int newpriority;
1102 struct proc *p = l->l_proc;
1103
1104 SCHED_ASSERT_LOCKED();
1105
1106 newpriority = PUSER + p->p_estcpu +
1107 NICE_WEIGHT * (p->p_nice - NZERO);
1108 newpriority = min(newpriority, MAXPRI);
1109 l->l_usrpri = newpriority;
1110 resched_proc(l, l->l_usrpri);
1111 }
1112
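/*
 * Editor's worked example (illustrative; assumes the usual NetBSD
 * values PUSER == 50, NICE_WEIGHT == 2 and MAXPRI == 127): an LWP
 * whose process has p_estcpu == 20 and p_nice == NZERO + 4 gets
 * newpriority = 50 + 20 + 2 * 4 == 78; the min() clamp only matters
 * once the estimator grows very large.
 */
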
1113 /*
1114 * Recompute priority for all LWPs in a process.
1115 */
1116 void
1117 resetprocpriority(struct proc *p)
1118 {
1119 struct lwp *l;
1120
1121 LIST_FOREACH(l, &p->p_lwps, l_sibling)
1122 resetpriority(l);
1123 }
1124
1125 /*
1126 * We adjust the priority of the current process. The priority of a process
1127 * gets worse as it accumulates CPU time. The CPU usage estimator (p_estcpu)
1128 * is increased here. The formula for computing priorities (resetpriority()
1129 * above) will compute a different value each time p_estcpu increases. This can
1130 * cause a switch, but unless the priority crosses a PPQ boundary the actual
1131 * queue will not change. The CPU usage estimator ramps up quite quickly
1132 * when the process is running (linearly), and decays away exponentially, at
1133 * a rate which is proportionally slower when the system is busy. The basic
1134 * principle is that the system will 90% forget that the process used a lot
1135 * of CPU time in 5 * loadav seconds. This causes the system to favor
1136 * processes which haven't run much recently, and to round-robin among other
1137 * processes.
1138 */
1139
1140 void
1141 schedclock(struct lwp *l)
1142 {
1143 struct proc *p = l->l_proc;
1144 int s;
1145
1146 p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
1147 SCHED_LOCK(s);
1148 resetpriority(l);
1149 SCHED_UNLOCK(s);
1150
1151 if (l->l_priority >= PUSER)
1152 l->l_priority = l->l_usrpri;
1153 }
1154
1155 void
1156 suspendsched(void)
1157 {
1158 struct lwp *l;
1159 int s;
1160
1161 /*
1162 * Convert all non-P_SYSTEM LSSLEEP or LSRUN LWPs to
1163 * LSSUSPENDED.
1164 */
1165 proclist_lock_read();
1166 SCHED_LOCK(s);
1167 LIST_FOREACH(l, &alllwp, l_list) {
1168 if ((l->l_proc->p_flag & P_SYSTEM) != 0)
1169 continue;
1170
1171 switch (l->l_stat) {
1172 case LSRUN:
1173 l->l_proc->p_nrlwps--;
1174 if ((l->l_flag & L_INMEM) != 0)
1175 remrunqueue(l);
1176 /* FALLTHROUGH */
1177 case LSSLEEP:
1178 l->l_stat = LSSUSPENDED;
1179 break;
1180 case LSONPROC:
1181 /*
1182 * XXX SMP: we need to deal with processes on
1183 * other CPUs!
1184 */
1185 break;
1186 default:
1187 break;
1188 }
1189 }
1190 SCHED_UNLOCK(s);
1191 proclist_unlock_read();
1192 }
1193
1194 /*
1195 * Low-level routines to access the run queue. Optimised assembler
1196 * routines can override these.
1197 */
1198
1199 #ifndef __HAVE_MD_RUNQUEUE
1200
1201 /*
1202 * On some architectures, it's faster to use an MSB ordering for the priorities
1203 * than the traditional LSB ordering.
1204 */
1205 #ifdef __HAVE_BIGENDIAN_BITOPS
1206 #define RQMASK(n) (0x80000000 >> (n))
1207 #else
1208 #define RQMASK(n) (0x00000001 << (n))
1209 #endif
1210
1211 /*
1212 * The primitives that manipulate the run queues. sched_whichqs tells
1213 * which of the 32 queues sched_qs have LWPs on them. setrunqueue() puts
1214 * LWPs on queues; remrunqueue() removes them. The running LWP is on no
1215 * queue; other runnable LWPs are on the queue indexed by l->l_priority / 4,
1216 * which shrinks the 0-127 range of priorities onto the 32 available
1217 * queues.
1218 */
1219
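/*
 * Editor's sketch (illustrative, not from this file): with the LSB
 * ordering of RQMASK, MD cpu_switch() code typically picks the best
 * non-empty queue with a find-first-set over sched_whichqs:
 */
#if 0
	int whichq;
	struct lwp *l;

	while (sched_whichqs == 0)
		;				/* idle: nothing runnable */
	whichq = ffs(sched_whichqs) - 1;	/* lowest bit == best priority */
	l = sched_qs[whichq].ph_link;		/* head of that queue */
#endif
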
1220 #ifdef RQDEBUG
1221 static void
1222 checkrunqueue(int whichq, struct lwp *l)
1223 {
1224 const struct prochd * const rq = &sched_qs[whichq];
1225 struct lwp *l2;
1226 int found = 0;
1227 int die = 0;
1228 int empty = 1;
1229 for (l2 = rq->ph_link; l2 != (void*) rq; l2 = l2->l_forw) {
1230 if (l2->l_stat != LSRUN) {
1231 printf("checkrunqueue[%d]: lwp %p state (%d) "
1232 " != LSRUN\n", whichq, l2, l2->l_stat);
1233 }
1234 if (l2->l_back->l_forw != l2) {
1235 printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
1236 "corrupt %p\n", whichq, l2, l2->l_back,
1237 l2->l_back->l_forw);
1238 die = 1;
1239 }
1240 if (l2->l_forw->l_back != l2) {
1241 printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
1242 "corrupt %p\n", whichq, l2, l2->l_forw,
1243 l2->l_forw->l_back);
1244 die = 1;
1245 }
1246 if (l2 == l)
1247 found = 1;
1248 empty = 0;
1249 }
1250 if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
1251 printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
1252 whichq, rq);
1253 die = 1;
1254 } else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
1255 printf("checkrunqueue[%d]: bit clear for non-empty "
1256 "run-queue %p\n", whichq, rq);
1257 die = 1;
1258 }
1259 if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
1260 printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
1261 whichq, l);
1262 die = 1;
1263 }
1264 if (l != NULL && empty) {
1265 printf("checkrunqueue[%d]: empty run-queue %p with "
1266 "active lwp %p\n", whichq, rq, l);
1267 die = 1;
1268 }
1269 if (l != NULL && !found) {
1270 printf("checkrunqueue[%d]: lwp %p not in runqueue %p!",
1271 whichq, l, rq);
1272 die = 1;
1273 }
1274 if (die)
1275 panic("checkrunqueue: inconsistency found");
1276 }
1277 #endif /* RQDEBUG */
1278
1279 void
1280 setrunqueue(struct lwp *l)
1281 {
1282 struct prochd *rq;
1283 struct lwp *prev;
1284 const int whichq = l->l_priority / 4;
1285
1286 #ifdef RQDEBUG
1287 checkrunqueue(whichq, NULL);
1288 #endif
1289 #ifdef DIAGNOSTIC
1290 if (l->l_back != NULL || l->l_wchan != NULL || l->l_stat != LSRUN)
1291 panic("setrunqueue");
1292 #endif
1293 sched_whichqs |= RQMASK(whichq);
1294 rq = &sched_qs[whichq];
1295 prev = rq->ph_rlink;
1296 l->l_forw = (struct lwp *)rq;
1297 rq->ph_rlink = l;
1298 prev->l_forw = l;
1299 l->l_back = prev;
1300 #ifdef RQDEBUG
1301 checkrunqueue(whichq, l);
1302 #endif
1303 }
1304
1305 void
1306 remrunqueue(struct lwp *l)
1307 {
1308 struct lwp *prev, *next;
1309 const int whichq = l->l_priority / 4;
1310 #ifdef RQDEBUG
1311 checkrunqueue(whichq, l);
1312 #endif
1313 #ifdef DIAGNOSTIC
1314 if (((sched_whichqs & RQMASK(whichq)) == 0))
1315 panic("remrunqueue: bit %d not set", whichq);
1316 #endif
1317 prev = l->l_back;
1318 l->l_back = NULL;
1319 next = l->l_forw;
1320 prev->l_forw = next;
1321 next->l_back = prev;
1322 if (prev == next)
1323 sched_whichqs &= ~RQMASK(whichq);
1324 #ifdef RQDEBUG
1325 checkrunqueue(whichq, NULL);
1326 #endif
1327 }
1328
1329 #undef RQMASK
1330 #endif /* !defined(__HAVE_MD_RUNQUEUE) */