FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_synch.c
/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: releng/5.2/sys/kern/kern_synch.c 121688 2003-10-29 15:23:09Z bde $");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)

int	hogticks;
int	lbolt;

static struct callout loadav_callout;
static struct callout lbolt_callout;

struct loadavg averunnable =
	{ {0, 0, 0}, FSCALE };	/* load average, of runnable procs */
/*
 * Constants for averages over 1, 5, and 15 minutes
 * when sampling at 5 second intervals.
 */
static fixpt_t cexp[3] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};
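
/*
 * Editor's note (not part of the original file): the constants above feed
 * the exponential-decay recurrence used by loadav() below.  With samples
 * taken every 5 seconds, an average over T seconds decays by exp(-5/T)
 * per sample, so for the 1, 5 and 15 minute averages:
 *
 *	load = load * exp(-5/T) + nrun * (1 - exp(-5/T))
 *	exp(-5/60) = exp(-1/12), exp(-5/300) = exp(-1/60),
 *	exp(-5/900) = exp(-1/180)
 *
 * A rough floating-point sketch of the same update (illustrative only;
 * the kernel uses the fixed-point form in loadav()):
 *
 *	double decay = exp(-5.0 / 60.0);	(1-minute average)
 *	load1 = load1 * decay + nrun * (1.0 - decay);
 */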

/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
static int	fscale __unused = FSCALE;
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");

static void	endtsleep(void *);
static void	loadav(void *arg);
static void	lboltcb(void *arg);

/*
 * We're only looking at 7 bits of the address; everything is
 * aligned to 4, lots of things are aligned to greater powers
 * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 */
#define TABLESIZE	128
static TAILQ_HEAD(slpquehead, thread) slpque[TABLESIZE];
#define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
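
/*
 * Editor's note (not part of the original file): a worked example of the
 * LOOKUP() hash above, for a hypothetical wait channel address:
 *
 *	LOOKUP((void *)0xc0a1b3c8)
 *	    == ((0xc0a1b3c8 >> 8) & (TABLESIZE - 1))
 *	    == (0xc0a1b3 & 0x7f)
 *	    == 0x33			(slot 51 of slpque[])
 *
 * Threads sleeping on different channels can hash to the same slot, which
 * is why wakeup() below still compares td_wchan against the ident.
 */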

void
sleepinit(void)
{
	int i;

	hogticks = (hz / 10) * 2;	/* Default only. */
	for (i = 0; i < TABLESIZE; i++)
		TAILQ_INIT(&slpque[i]);
}

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping; otherwise signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 *
 * The mutex argument is exited before the caller is suspended, and
 * entered before msleep returns.  If priority includes the PDROP
 * flag the mutex is not entered before returning.
 */

int
msleep(ident, mtx, priority, wmesg, timo)
	void *ident;
	struct mtx *mtx;
	int priority, timo;
	const char *wmesg;
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int sig, catch = priority & PCATCH;
	int rval = 0;
	WITNESS_SAVE_DECL(mtx);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_CSW))
		ktrcsw(1, 0);
#endif
	/* XXX: mtx == NULL ?? */
	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &mtx->mtx_object,
	    "Sleeping on \"%s\"", wmesg);
	KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL,
	    ("sleeping without a mutex"));
	/*
	 * If we are capable of async syscalls and there isn't already
	 * another one ready to return, start a new thread
	 * and queue it as ready to run.  Note that there is danger here
	 * because we need to make sure that we don't sleep allocating
	 * the thread (recursion here might be bad).
	 */
	mtx_lock_spin(&sched_lock);
	if (p->p_flag & P_SA || p->p_numthreads > 1) {
		/*
		 * Just don't bother if we are exiting
		 * and not the exiting thread or thread was marked as
		 * interrupted.
		 */
		if (catch) {
			if ((p->p_flag & P_WEXIT) && p->p_singlethread != td) {
				mtx_unlock_spin(&sched_lock);
				return (EINTR);
			}
			if (td->td_flags & TDF_INTERRUPT) {
				mtx_unlock_spin(&sched_lock);
				return (td->td_intrval);
			}
		}
	}
	if (cold) {
		/*
		 * During autoconfiguration, just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 * XXX: this used to do "s = splhigh(); splx(safepri);
		 * splx(s);" to give interrupts a chance, but there is
		 * no way to give interrupts a chance now.
		 */
		if (mtx != NULL && priority & PDROP)
			mtx_unlock(mtx);
		mtx_unlock_spin(&sched_lock);
		return (0);
	}
	DROP_GIANT();
	if (mtx != NULL) {
		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
		WITNESS_SAVE(&mtx->mtx_object, mtx);
		mtx_unlock(mtx);
		if (priority & PDROP)
			mtx = NULL;
	}
	KASSERT(p != NULL, ("msleep1"));
	KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));

	CTR5(KTR_PROC, "msleep: thread %p (pid %d, %s) on %s (%p)",
	    td, p->p_pid, p->p_comm, wmesg, ident);

	td->td_wchan = ident;
	td->td_wmesg = wmesg;
	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], td, td_slpq);
	TD_SET_ON_SLEEPQ(td);
	if (timo)
		callout_reset(&td->td_slpcallout, timo, endtsleep, td);
	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling thread_suspend_check, as we could stop there,
	 * and a wakeup or a SIGCONT (or both) could occur while we were
	 * stopped without resuming us; thus we must be ready for sleep
	 * when cursig is called.  If the wakeup happens while we're
	 * stopped, td->td_wchan will be 0 upon return from cursig.
	 */
	if (catch) {
		CTR3(KTR_PROC, "msleep caught: thread %p (pid %d, %s)", td,
		    p->p_pid, p->p_comm);
		td->td_flags |= TDF_SINTR;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		mtx_lock(&p->p_sigacts->ps_mtx);
		sig = cursig(td);
		mtx_unlock(&p->p_sigacts->ps_mtx);
		if (sig == 0 && thread_suspend_check(1))
			sig = SIGSTOP;
		mtx_lock_spin(&sched_lock);
		PROC_UNLOCK(p);
		if (sig != 0) {
			if (TD_ON_SLEEPQ(td))
				unsleep(td);
		} else if (!TD_ON_SLEEPQ(td))
			catch = 0;
	} else
		sig = 0;

	/*
	 * Let the scheduler know we're about to voluntarily go to sleep.
	 */
	sched_sleep(td, priority & PRIMASK);

	if (TD_ON_SLEEPQ(td)) {
		p->p_stats->p_ru.ru_nvcsw++;
		TD_SET_SLEEPING(td);
		mi_switch();
	}
	/*
	 * We're awake from voluntary sleep.
	 */
	CTR3(KTR_PROC, "msleep resume: thread %p (pid %d, %s)", td, p->p_pid,
	    p->p_comm);
	KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
	td->td_flags &= ~TDF_SINTR;
	if (td->td_flags & TDF_TIMEOUT) {
		td->td_flags &= ~TDF_TIMEOUT;
		if (sig == 0)
			rval = EWOULDBLOCK;
	} else if (td->td_flags & TDF_TIMOFAIL) {
		td->td_flags &= ~TDF_TIMOFAIL;
	} else if (timo && callout_stop(&td->td_slpcallout) == 0) {
		/*
		 * This isn't supposed to be pretty.  If we are here, then
		 * the endtsleep() callout is currently executing on another
		 * CPU and is either spinning on the sched_lock or will be
		 * soon.  If we don't synchronize here, there is a chance
		 * that this process may msleep() again before the callout
		 * has a chance to run and the callout may end up waking up
		 * the wrong msleep().  Yuck.
		 */
		TD_SET_SLEEPING(td);
		p->p_stats->p_ru.ru_nivcsw++;
		mi_switch();
		td->td_flags &= ~TDF_TIMOFAIL;
	}
	if ((td->td_flags & TDF_INTERRUPT) && (priority & PCATCH) &&
	    (rval == 0)) {
		rval = td->td_intrval;
	}
	mtx_unlock_spin(&sched_lock);
	if (rval == 0 && catch) {
		PROC_LOCK(p);
		/* XXX: shouldn't we always be calling cursig()? */
		mtx_lock(&p->p_sigacts->ps_mtx);
		if (sig != 0 || (sig = cursig(td))) {
			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
				rval = EINTR;
			else
				rval = ERESTART;
		}
		mtx_unlock(&p->p_sigacts->ps_mtx);
		PROC_UNLOCK(p);
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_CSW))
		ktrcsw(0, 0);
#endif
	PICKUP_GIANT();
	if (mtx != NULL) {
		mtx_lock(mtx);
		WITNESS_RESTORE(&mtx->mtx_object, mtx);
	}
	return (rval);
}
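
/*
 * Editor's note (not part of the original file): a minimal, hypothetical
 * sketch of the msleep()/wakeup() contract documented above.  The names
 * foo_softc, sc_mtx, sc_ready, foo_wait_ready() and foo_mark_ready() are
 * invented for illustration; only msleep(), wakeup() and the mutex API are
 * taken from the kernel.  The #if 0 guard keeps the sketch out of
 * compilation.
 */
#if 0
struct foo_softc {
	struct mtx	sc_mtx;		/* protects sc_ready */
	int		sc_ready;	/* predicate we sleep on */
};

static int
foo_wait_ready(struct foo_softc *sc)
{
	int error;

	mtx_lock(&sc->sc_mtx);
	/*
	 * Re-check the predicate in a loop: wakeup() wakes every sleeper
	 * on the channel, so another thread may consume the condition
	 * before we run again.
	 */
	while (sc->sc_ready == 0) {
		/*
		 * msleep() drops sc_mtx while asleep and reacquires it
		 * before returning (no PDROP).  PCATCH lets signals
		 * interrupt the sleep; 5 * hz ticks bounds each wait.
		 */
		error = msleep(&sc->sc_ready, &sc->sc_mtx, PZERO | PCATCH,
		    "fooready", 5 * hz);
		if (error != 0 && error != EWOULDBLOCK) {
			/* EINTR or ERESTART: a signal is pending. */
			mtx_unlock(&sc->sc_mtx);
			return (error);
		}
	}
	mtx_unlock(&sc->sc_mtx);
	return (0);
}

/* The producer side sets the predicate and wakes any sleepers. */
static void
foo_mark_ready(struct foo_softc *sc)
{

	mtx_lock(&sc->sc_mtx);
	sc->sc_ready = 1;
	wakeup(&sc->sc_ready);
	mtx_unlock(&sc->sc_mtx);
}
#endif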

/*
 * Implement timeout for msleep().
 *
 * If the process hasn't been awakened (wchan non-zero), set the timeout
 * flag and undo the sleep.  If the process is stopped, just unsleep it
 * so it will remain stopped.
 * MP-safe, called without the Giant mutex.
 */
static void
endtsleep(arg)
	void *arg;
{
	register struct thread *td;

	td = (struct thread *)arg;
	CTR3(KTR_PROC, "endtsleep: thread %p (pid %d, %s)",
	    td, td->td_proc->p_pid, td->td_proc->p_comm);
	mtx_lock_spin(&sched_lock);
	/*
	 * This is the other half of the synchronization with msleep()
	 * described above.  If the thread is still on its sleep queue,
	 * this is an ordinary timeout: take it off the queue and mark it
	 * with TDF_TIMEOUT.  Otherwise the thread was already awakened and
	 * we lost the race; mark it with TDF_TIMOFAIL so that msleep() can
	 * resynchronize with this callout.  Either way, make the thread
	 * runnable again.
	 */
	if (TD_ON_SLEEPQ(td)) {
		TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq);
		TD_CLR_ON_SLEEPQ(td);
		td->td_flags |= TDF_TIMEOUT;
		td->td_wmesg = NULL;
	} else
		td->td_flags |= TDF_TIMOFAIL;
	TD_CLR_SLEEPING(td);
	setrunnable(td);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Abort a thread, as if an interrupt had occurred.  Only abort
 * interruptible waits (unfortunately it isn't safe to abort others).
 * This is about identical to cv_abort().
 * Think about merging them?
 * Also, whatever the signal code does...
 */
void
abortsleep(struct thread *td)
{

	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * If the TDF_TIMEOUT flag is set, just leave. A
	 * timeout is scheduled anyhow.
	 */
	if ((td->td_flags & (TDF_TIMEOUT | TDF_SINTR)) == TDF_SINTR) {
		if (TD_ON_SLEEPQ(td)) {
			unsleep(td);
			TD_CLR_SLEEPING(td);
			setrunnable(td);
		}
	}
}

/*
 * Remove a thread from its sleep queue.
 */
void
unsleep(struct thread *td)
{

	mtx_lock_spin(&sched_lock);
	if (TD_ON_SLEEPQ(td)) {
		TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq);
		TD_CLR_ON_SLEEPQ(td);
		td->td_wmesg = NULL;
	}
	mtx_unlock_spin(&sched_lock);
}

/*
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(ident)
	register void *ident;
{
	register struct slpquehead *qp;
	register struct thread *td;
	struct thread *ntd;
	struct proc *p;

	mtx_lock_spin(&sched_lock);
	qp = &slpque[LOOKUP(ident)];
restart:
	for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
		ntd = TAILQ_NEXT(td, td_slpq);
		if (td->td_wchan == ident) {
			unsleep(td);
			TD_CLR_SLEEPING(td);
			setrunnable(td);
			p = td->td_proc;
			CTR3(KTR_PROC, "wakeup: thread %p (pid %d, %s)",
			    td, p->p_pid, p->p_comm);
			goto restart;
		}
	}
	mtx_unlock_spin(&sched_lock);
}

/*
 * Make a process sleeping on the specified identifier runnable.
 * May wake more than one process if a target process is currently
 * swapped out.
 */
void
wakeup_one(ident)
	register void *ident;
{
	register struct proc *p;
	register struct slpquehead *qp;
	register struct thread *td;
	struct thread *ntd;

	mtx_lock_spin(&sched_lock);
	qp = &slpque[LOOKUP(ident)];
	for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
		ntd = TAILQ_NEXT(td, td_slpq);
		if (td->td_wchan == ident) {
			unsleep(td);
			TD_CLR_SLEEPING(td);
			setrunnable(td);
			p = td->td_proc;
			CTR3(KTR_PROC, "wakeup1: thread %p (pid %d, %s)",
			    td, p->p_pid, p->p_comm);
			break;
		}
	}
	mtx_unlock_spin(&sched_lock);
}

/*
 * The machine independent parts of mi_switch().
 */
void
mi_switch(void)
{
	struct bintime new_switchtime;
	struct thread *td;
	struct proc *p;

	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
	td = curthread;			/* XXX */
	p = td->td_proc;		/* XXX */
	KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code"));
#ifdef INVARIANTS
	if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td))
		mtx_assert(&Giant, MA_NOTOWNED);
#endif
	KASSERT(td->td_critnest == 1,
	    ("mi_switch: switch in a critical section"));

	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 */
	binuptime(&new_switchtime);
	bintime_add(&p->p_runtime, &new_switchtime);
	bintime_sub(&p->p_runtime, PCPU_PTR(switchtime));

	td->td_generation++;	/* bump preempt-detect counter */

#ifdef DDB
	/*
	 * Don't perform context switches from the debugger.
	 */
	if (db_active) {
		mtx_unlock_spin(&sched_lock);
		db_print_backtrace();
		db_error("Context switches not allowed in the debugger");
	}
#endif

	/*
	 * Check if the process exceeds its cpu resource allocation.  If
	 * over max, arrange to kill the process in ast().
	 */
	if (p->p_cpulimit != RLIM_INFINITY &&
	    p->p_runtime.sec > p->p_cpulimit) {
		p->p_sflag |= PS_XCPU;
		td->td_flags |= TDF_ASTPENDING;
	}

	/*
	 * Finish up stats for outgoing thread.
	 */
	cnt.v_swtch++;
	PCPU_SET(switchtime, new_switchtime);
	PCPU_SET(switchticks, ticks);
	CTR3(KTR_PROC, "mi_switch: old thread %p (pid %d, %s)", td, p->p_pid,
	    p->p_comm);
	if (td->td_proc->p_flag & P_SA)
		thread_switchout(td);
	sched_switch(td);

	CTR3(KTR_PROC, "mi_switch: new thread %p (pid %d, %s)", td, p->p_pid,
	    p->p_comm);

	/*
	 * If the last thread was exiting, finish cleaning it up.
	 */
	if ((td = PCPU_GET(deadthread))) {
		PCPU_SET(deadthread, NULL);
		thread_stash(td);
	}
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(struct thread *td)
{
	struct proc *p;

	p = td->td_proc;
	mtx_assert(&sched_lock, MA_OWNED);
	switch (p->p_state) {
	case PRS_ZOMBIE:
		panic("setrunnable(1)");
	default:
		break;
	}
	switch (td->td_state) {
	case TDS_RUNNING:
	case TDS_RUNQ:
		return;
	case TDS_INHIBITED:
		/*
		 * If we are only inhibited because we are swapped out
		 * then arrange to swap in this process.  Otherwise just
		 * return.
		 */
		if (td->td_inhibitors != TDI_SWAPPED)
			return;
		/* XXX: intentional fall-through ? */
	case TDS_CAN_RUN:
		break;
	default:
		printf("state is 0x%x", td->td_state);
		panic("setrunnable(2)");
	}
	if ((p->p_sflag & PS_INMEM) == 0) {
		if ((p->p_sflag & PS_SWAPPINGIN) == 0) {
			p->p_sflag |= PS_SWAPINREQ;
			wakeup(&proc0);
		}
	} else
		sched_wakeup(td);
}

/*
 * Compute a tenex style load average of a quantity on
 * 1, 5 and 15 minute intervals.
 * XXXKSE Needs complete rewrite when correct info is available.
 * Completely Bogus.. only works with 1:1 (but compiles ok now :-)
 */
static void
loadav(void *arg)
{
	int i, nrun;
	struct loadavg *avg;
	struct proc *p;
	struct thread *td;

	avg = &averunnable;
	sx_slock(&allproc_lock);
	nrun = 0;
	FOREACH_PROC_IN_SYSTEM(p) {
		FOREACH_THREAD_IN_PROC(p, td) {
			switch (td->td_state) {
			case TDS_RUNQ:
			case TDS_RUNNING:
				if ((p->p_flag & P_NOLOAD) != 0)
					goto nextproc;
				nrun++;	/* XXXKSE */
			default:
				break;
			}
nextproc:
			continue;
		}
	}
	sx_sunlock(&allproc_lock);
	for (i = 0; i < 3; i++)
		avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
		    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;

	/*
	 * Schedule the next update to occur after 5 seconds, but add a
	 * random variation to avoid synchronisation with processes that
	 * run at regular intervals.
	 */
	callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)),
	    loadav, NULL);
}
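
/*
 * Editor's note (not part of the original file): ldavg[] holds fixed-point
 * values scaled by FSCALE (1 << FSHIFT).  A userland reader of the
 * vm.loadavg sysctl recovers the familiar floating-point load averages by
 * dividing by the reported scale, roughly:
 *
 *	struct loadavg la;	(filled in via sysctlbyname("vm.loadavg", ...))
 *	double load1 = (double)la.ldavg[0] / (double)la.fscale;
 */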

static void
lboltcb(void *arg)
{
	wakeup(&lbolt);
	callout_reset(&lbolt_callout, hz, lboltcb, NULL);
}

/* ARGSUSED */
static void
sched_setup(dummy)
	void *dummy;
{
	callout_init(&loadav_callout, 0);
	callout_init(&lbolt_callout, CALLOUT_MPSAFE);

	/* Kick off timeout driven events by calling first time. */
	loadav(NULL);
	lboltcb(NULL);
}

/*
 * General purpose yield system call
 */
int
yield(struct thread *td, struct yield_args *uap)
{
	struct ksegrp *kg;

	kg = td->td_ksegrp;
	mtx_assert(&Giant, MA_NOTOWNED);
	mtx_lock_spin(&sched_lock);
	kg->kg_proc->p_stats->p_ru.ru_nvcsw++;
	sched_prio(td, PRI_MAX_TIMESHARE);
	mi_switch();
	mtx_unlock_spin(&sched_lock);
	td->td_retval[0] = 0;
	return (0);
}