FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_pageout.c
1 /*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * The Mach Operating System project at Carnegie-Mellon University.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91
41 *
42 *
43 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
44 * All rights reserved.
45 *
46 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
47 *
48 * Permission to use, copy, modify and distribute this software and
49 * its documentation is hereby granted, provided that both the copyright
50 * notice and this permission notice appear in all copies of the
51 * software, derivative works or modified versions, and any portions
52 * thereof, and that both notices appear in supporting documentation.
53 *
54 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
55 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
56 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
57 *
58 * Carnegie Mellon requests users of this software to return to
59 *
60 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
61 * School of Computer Science
62 * Carnegie Mellon University
63 * Pittsburgh PA 15213-3890
64 *
65 * any improvements or extensions that they make and grant Carnegie the
66 * rights to redistribute these changes.
67 *
68 * $FreeBSD: releng/5.0/sys/vm/vm_pageout.c 107436 2002-12-01 05:40:18Z alc $
69 */
70
71 /*
72 * The proverbial page-out daemon.
73 */
74
75 #include "opt_vm.h"
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/kernel.h>
79 #include <sys/eventhandler.h>
80 #include <sys/lock.h>
81 #include <sys/mutex.h>
82 #include <sys/proc.h>
83 #include <sys/kthread.h>
84 #include <sys/ktr.h>
85 #include <sys/resourcevar.h>
86 #include <sys/sched.h>
87 #include <sys/signalvar.h>
88 #include <sys/vnode.h>
89 #include <sys/vmmeter.h>
90 #include <sys/sx.h>
91 #include <sys/sysctl.h>
92
93 #include <vm/vm.h>
94 #include <vm/vm_param.h>
95 #include <vm/vm_object.h>
96 #include <vm/vm_page.h>
97 #include <vm/vm_map.h>
98 #include <vm/vm_pageout.h>
99 #include <vm/vm_pager.h>
100 #include <vm/swap_pager.h>
101 #include <vm/vm_extern.h>
102 #include <vm/uma.h>
103
104 #include <machine/mutex.h>
105
106 /*
107 * System initialization
108 */
109
110 /* the kernel process "vm_pageout"*/
111 static void vm_pageout(void);
112 static int vm_pageout_clean(vm_page_t);
113 static void vm_pageout_pmap_collect(void);
114 static void vm_pageout_scan(int pass);
115 static int vm_pageout_free_page_calc(vm_size_t count);
116 struct proc *pageproc;
117
118 static struct kproc_desc page_kp = {
119 "pagedaemon",
120 vm_pageout,
121 &pageproc
122 };
123 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp)
124
125 #if !defined(NO_SWAPPING)
126 /* the kernel process "vm_daemon"*/
127 static void vm_daemon(void);
128 static struct proc *vmproc;
129
130 static struct kproc_desc vm_kp = {
131 "vmdaemon",
132 vm_daemon,
133 &vmproc
134 };
135 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp)
136 #endif
137
138
139 int vm_pages_needed=0; /* Event on which pageout daemon sleeps */
140 int vm_pageout_deficit=0; /* Estimated number of pages deficit */
141 int vm_pageout_pages_needed=0; /* flag saying that the pageout daemon needs pages */
142
143 #if !defined(NO_SWAPPING)
144 static int vm_pageout_req_swapout; /* XXX */
145 static int vm_daemon_needed;
146 #endif
147 extern int vm_swap_size;
148 static int vm_max_launder = 32;
149 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
150 static int vm_pageout_full_stats_interval = 0;
151 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
152 static int defer_swap_pageouts=0;
153 static int disable_swap_pageouts=0;
154
155 #if defined(NO_SWAPPING)
156 static int vm_swap_enabled=0;
157 static int vm_swap_idle_enabled=0;
158 #else
159 static int vm_swap_enabled=1;
160 static int vm_swap_idle_enabled=0;
161 #endif
162
163 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
164 CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
165
166 SYSCTL_INT(_vm, OID_AUTO, max_launder,
167 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
168
169 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
170 CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
171
172 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
173 CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
174
175 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
176 CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
177
178 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
179 CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
180
181 #if defined(NO_SWAPPING)
182 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
183 CTLFLAG_RD, &vm_swap_enabled, 0, "");
184 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
185 CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
186 #else
187 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
188 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
189 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
190 CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
191 #endif
192
193 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
194 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
195
196 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
197 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
198
199 static int pageout_lock_miss;
200 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
201 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
202
203 #define VM_PAGEOUT_PAGE_COUNT 16
204 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
205
206 int vm_page_max_wired; /* XXX max # of wired pages system-wide */
207
208 #if !defined(NO_SWAPPING)
209 typedef void freeer_fcn_t(vm_map_t, vm_object_t, vm_pindex_t, int);
210 static void vm_pageout_map_deactivate_pages(vm_map_t, vm_pindex_t);
211 static freeer_fcn_t vm_pageout_object_deactivate_pages;
212 static void vm_req_vmdaemon(void);
213 #endif
214 static void vm_pageout_page_stats(void);
215
216 /*
217 * vm_pageout_clean:
218 *
219 * Clean the page and remove it from the laundry.
220 *
221 * We set the busy bit to cause potential page faults on this page to
222 * block. Note the careful timing, however: the busy bit isn't set until
223 * late, and we cannot do anything that will mess with the page.
224 */
225 static int
226 vm_pageout_clean(m)
227 vm_page_t m;
228 {
229 vm_object_t object;
230 vm_page_t mc[2*vm_pageout_page_count];
231 int pageout_count;
232 int ib, is, page_base;
233 vm_pindex_t pindex = m->pindex;
234
235 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
236
237 object = m->object;
238
239 /*
240 * It doesn't cost us anything to page out OBJT_DEFAULT or OBJT_SWAP
241 * with the new swapper, but we could have serious problems paging
242 * out other object types if there is insufficient memory.
243 *
244 * Unfortunately, checking free memory here is far too late, so the
245 * check has been moved up a procedural level.
246 */
247
248 /*
249 * Don't mess with the page if it's busy, held, or special
250 */
251 if ((m->hold_count != 0) ||
252 ((m->busy != 0) || (m->flags & (PG_BUSY|PG_UNMANAGED)))) {
253 return 0;
254 }
255
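/*
 * Seed the cluster array with the target page at its midpoint so the
 * backward scan below can grow the run toward lower indices (by
 * decrementing page_base) while the forward scan grows it toward
 * higher indices, leaving one contiguous run for vm_pageout_flush().
 */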
256 mc[vm_pageout_page_count] = m;
257 pageout_count = 1;
258 page_base = vm_pageout_page_count;
259 ib = 1;
260 is = 1;
261
262 /*
263 * Scan object for clusterable pages.
264 *
265 * We can cluster ONLY if: ->> the page is NOT
266 * clean, wired, busy, held, or mapped into a
267 * buffer, and one of the following:
268 * 1) The page is inactive, or a seldom used
269 * active page.
270 * -or-
271 * 2) we force the issue.
272 *
273 * During heavy mmap/modification loads the pageout
274 * daemon can really fragment the underlying file
275 * due to flushing pages out of order and not trying
276 * to align the clusters (which leaves sporadic out-of-order
277 * holes). To solve this problem we do the reverse scan
278 * first and attempt to align our cluster, then do a
279 * forward scan if room remains.
280 */
281 more:
282 while (ib && pageout_count < vm_pageout_page_count) {
283 vm_page_t p;
284
285 if (ib > pindex) {
286 ib = 0;
287 break;
288 }
289
290 if ((p = vm_page_lookup(object, pindex - ib)) == NULL) {
291 ib = 0;
292 break;
293 }
294 if (((p->queue - p->pc) == PQ_CACHE) ||
295 (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
296 ib = 0;
297 break;
298 }
299 vm_page_test_dirty(p);
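/*
 * Stop clustering at a page that is effectively clean (none of its
 * valid bits are dirty), no longer inactive, or pinned by a wiring
 * or hold.
 */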
300 if ((p->dirty & p->valid) == 0 ||
301 p->queue != PQ_INACTIVE ||
302 p->wire_count != 0 || /* may be held by buf cache */
303 p->hold_count != 0) { /* may be undergoing I/O */
304 ib = 0;
305 break;
306 }
307 mc[--page_base] = p;
308 ++pageout_count;
309 ++ib;
310 /*
311 * alignment boundary, stop here and switch directions. Do
312 * not clear ib.
313 */
314 if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
315 break;
316 }
317
318 while (pageout_count < vm_pageout_page_count &&
319 pindex + is < object->size) {
320 vm_page_t p;
321
322 if ((p = vm_page_lookup(object, pindex + is)) == NULL)
323 break;
324 if (((p->queue - p->pc) == PQ_CACHE) ||
325 (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
326 break;
327 }
328 vm_page_test_dirty(p);
329 if ((p->dirty & p->valid) == 0 ||
330 p->queue != PQ_INACTIVE ||
331 p->wire_count != 0 || /* may be held by buf cache */
332 p->hold_count != 0) { /* may be undergoing I/O */
333 break;
334 }
335 mc[page_base + pageout_count] = p;
336 ++pageout_count;
337 ++is;
338 }
339
340 /*
341 * If we exhausted our forward scan, continue with the reverse scan
342 * when possible, even past a page boundary. This catches boundary
343 * conditions.
344 */
345 if (ib && pageout_count < vm_pageout_page_count)
346 goto more;
347
348 /*
349 * we allow reads during pageouts...
350 */
351 return vm_pageout_flush(&mc[page_base], pageout_count, 0);
352 }
353
354 /*
355 * vm_pageout_flush() - launder the given pages
356 *
357 * The given pages are laundered. Note that we set up for the start of
358 * I/O (i.e. busy the page), mark it read-only, and bump the object's
359 * paging-in-progress count all in here rather than in the parent. If we want
360 * the parent to do more sophisticated things we may have to change
361 * the ordering.
362 */
363 int
364 vm_pageout_flush(mc, count, flags)
365 vm_page_t *mc;
366 int count;
367 int flags;
368 {
369 vm_object_t object;
370 int pageout_status[count];
371 int numpagedout = 0;
372 int i;
373
374 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
375 /*
376 * Initiate I/O. Bump the vm_page_t->busy counter and
377 * mark the pages read-only.
378 *
379 * We do not have to fixup the clean/dirty bits here... we can
380 * allow the pager to do it after the I/O completes.
381 *
382 * NOTE! mc[i]->dirty may be partial or fragmented due to an
383 * edge case with file fragments.
384 */
385 for (i = 0; i < count; i++) {
386 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush page %p index %d/%d: partially invalid page", mc[i], i, count));
387 vm_page_io_start(mc[i]);
388 pmap_page_protect(mc[i], VM_PROT_READ);
389 }
390 object = mc[0]->object;
391 vm_page_unlock_queues();
392 vm_object_pip_add(object, count);
393
394 vm_pager_put_pages(object, mc, count,
395 (flags | ((object == kernel_object) ? OBJPC_SYNC : 0)),
396 pageout_status);
397
398 vm_page_lock_queues();
399 for (i = 0; i < count; i++) {
400 vm_page_t mt = mc[i];
401
402 switch (pageout_status[i]) {
403 case VM_PAGER_OK:
404 numpagedout++;
405 break;
406 case VM_PAGER_PEND:
407 numpagedout++;
408 break;
409 case VM_PAGER_BAD:
410 /*
411 * Page outside of range of object. Right now we
412 * essentially lose the changes by pretending it
413 * worked.
414 */
415 pmap_clear_modify(mt);
416 vm_page_undirty(mt);
417 break;
418 case VM_PAGER_ERROR:
419 case VM_PAGER_FAIL:
420 /*
421 * If page couldn't be paged out, then reactivate the
422 * page so it doesn't clog the inactive list. (We
423 * will try paging it out again later).
424 */
425 vm_page_activate(mt);
426 break;
427 case VM_PAGER_AGAIN:
428 break;
429 }
430
431 /*
432 * If the operation is still going, leave the page busy to
433 * block all other accesses. Also, leave the paging in
434 * progress indicator set so that we don't attempt an object
435 * collapse.
436 */
437 if (pageout_status[i] != VM_PAGER_PEND) {
438 vm_object_pip_wakeup(object);
439 vm_page_io_finish(mt);
440 if (!vm_page_count_severe() || !vm_page_try_to_cache(mt))
441 pmap_page_protect(mt, VM_PROT_READ);
442 }
443 }
444 return numpagedout;
445 }
446
447 #if !defined(NO_SWAPPING)
448 /*
449 * vm_pageout_object_deactivate_pages
450 *
451 * deactivate enough pages to satisfy the inactive target
452 * requirements or, if vm_page_proc_limit is set,
453 * deactivate all of the pages in the object and its
454 * backing_objects.
455 *
456 * The object and map must be locked.
457 */
458 static void
459 vm_pageout_object_deactivate_pages(map, object, desired, map_remove_only)
460 vm_map_t map;
461 vm_object_t object;
462 vm_pindex_t desired;
463 int map_remove_only;
464 {
465 vm_page_t p, next;
466 int actcount, rcount, remove_mode;
467
468 GIANT_REQUIRED;
469 if (object->type == OBJT_DEVICE || object->type == OBJT_PHYS)
470 return;
471
472 while (object) {
473 if (pmap_resident_count(vm_map_pmap(map)) <= desired)
474 return;
475 if (object->paging_in_progress)
476 return;
477
478 remove_mode = map_remove_only;
479 if (object->shadow_count > 1)
480 remove_mode = 1;
481 /*
482 * scan the object's entire memory queue
483 */
484 rcount = object->resident_page_count;
485 p = TAILQ_FIRST(&object->memq);
486 vm_page_lock_queues();
487 while (p && (rcount-- > 0)) {
488 if (pmap_resident_count(map->pmap) <= desired) {
489 vm_page_unlock_queues();
490 return;
491 }
492 next = TAILQ_NEXT(p, listq);
493 cnt.v_pdpages++;
494 if (p->wire_count != 0 ||
495 p->hold_count != 0 ||
496 p->busy != 0 ||
497 (p->flags & (PG_BUSY|PG_UNMANAGED)) ||
498 !pmap_page_exists_quick(vm_map_pmap(map), p)) {
499 p = next;
500 continue;
501 }
502 actcount = pmap_ts_referenced(p);
503 if (actcount) {
504 vm_page_flag_set(p, PG_REFERENCED);
505 } else if (p->flags & PG_REFERENCED) {
506 actcount = 1;
507 }
508 if ((p->queue != PQ_ACTIVE) &&
509 (p->flags & PG_REFERENCED)) {
510 vm_page_activate(p);
511 p->act_count += actcount;
512 vm_page_flag_clear(p, PG_REFERENCED);
513 } else if (p->queue == PQ_ACTIVE) {
514 if ((p->flags & PG_REFERENCED) == 0) {
515 p->act_count -= min(p->act_count, ACT_DECLINE);
516 if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) {
517 pmap_remove_all(p);
518 vm_page_deactivate(p);
519 } else {
520 vm_pageq_requeue(p);
521 }
522 } else {
523 vm_page_activate(p);
524 vm_page_flag_clear(p, PG_REFERENCED);
525 if (p->act_count < (ACT_MAX - ACT_ADVANCE))
526 p->act_count += ACT_ADVANCE;
527 vm_pageq_requeue(p);
528 }
529 } else if (p->queue == PQ_INACTIVE) {
530 pmap_remove_all(p);
531 }
532 p = next;
533 }
534 vm_page_unlock_queues();
535 object = object->backing_object;
536 }
537 }
538
539 /*
540 * deactivate some number of pages in a map; try to do it fairly, but
541 * that is really hard to do.
542 */
543 static void
544 vm_pageout_map_deactivate_pages(map, desired)
545 vm_map_t map;
546 vm_pindex_t desired;
547 {
548 vm_map_entry_t tmpe;
549 vm_object_t obj, bigobj;
550 int nothingwired;
551
552 GIANT_REQUIRED;
553 if (!vm_map_trylock(map))
554 return;
555
556 bigobj = NULL;
557 nothingwired = TRUE;
558
559 /*
560 * first, search out the biggest object, and try to free pages from
561 * that.
562 */
563 tmpe = map->header.next;
564 while (tmpe != &map->header) {
565 if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
566 obj = tmpe->object.vm_object;
567 if ((obj != NULL) && (obj->shadow_count <= 1) &&
568 ((bigobj == NULL) ||
569 (bigobj->resident_page_count < obj->resident_page_count))) {
570 bigobj = obj;
571 }
572 }
573 if (tmpe->wired_count > 0)
574 nothingwired = FALSE;
575 tmpe = tmpe->next;
576 }
577
578 if (bigobj)
579 vm_pageout_object_deactivate_pages(map, bigobj, desired, 0);
580
581 /*
582 * Next, hunt around for other pages to deactivate. We actually
583 * do this search sort of wrong -- .text first is not the best idea.
584 */
585 tmpe = map->header.next;
586 while (tmpe != &map->header) {
587 if (pmap_resident_count(vm_map_pmap(map)) <= desired)
588 break;
589 if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
590 obj = tmpe->object.vm_object;
591 if (obj)
592 vm_pageout_object_deactivate_pages(map, obj, desired, 0);
593 }
594 tmpe = tmpe->next;
595 }
596
597 /*
598 * Remove all mappings if a process is swapped out; this will free page
599 * table pages.
600 */
601 if (desired == 0 && nothingwired) {
602 vm_page_lock_queues();
603 pmap_remove(vm_map_pmap(map), vm_map_min(map),
604 vm_map_max(map));
605 vm_page_unlock_queues();
606 }
607 vm_map_unlock(map);
608 }
609 #endif /* !defined(NO_SWAPPING) */
610
611 /*
612 * Don't try to be fancy - being fancy can lead to VOP_LOCK's and therefore
613 * to vnode deadlocks. We only do it for OBJT_DEFAULT and OBJT_SWAP objects
614 * which we know can be trivially freed.
615 */
616 void
617 vm_pageout_page_free(vm_page_t m)
618 {
619 vm_object_t object = m->object;
620 int type = object->type;
621
622 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
623 if (type == OBJT_SWAP || type == OBJT_DEFAULT)
624 vm_object_reference(object);
625 vm_page_busy(m);
626 pmap_remove_all(m);
627 vm_page_free(m);
628 cnt.v_dfree++;
629 if (type == OBJT_SWAP || type == OBJT_DEFAULT)
630 vm_object_deallocate(object);
631 }
632
633 /*
634 * This routine is very drastic, but can save the system
635 * in a pinch.
636 */
637 static void
638 vm_pageout_pmap_collect(void)
639 {
640 int i;
641 vm_page_t m;
642 static int warningdone;
643
644 if (pmap_pagedaemon_waken == 0)
645 return;
646 if (warningdone < 5) {
647 printf("collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
648 warningdone++;
649 }
650 vm_page_lock_queues();
651 for (i = 0; i < vm_page_array_size; i++) {
652 m = &vm_page_array[i];
653 if (m->wire_count || m->hold_count || m->busy ||
654 (m->flags & (PG_BUSY | PG_UNMANAGED)))
655 continue;
656 pmap_remove_all(m);
657 }
658 vm_page_unlock_queues();
659 pmap_pagedaemon_waken = 0;
660 }
661
662 /*
663 * vm_pageout_scan does the dirty work for the pageout daemon.
664 */
665 static void
666 vm_pageout_scan(int pass)
667 {
668 vm_page_t m, next;
669 struct vm_page marker;
670 int save_page_shortage;
671 int save_inactive_count;
672 int page_shortage, maxscan, pcount;
673 int addl_page_shortage, addl_page_shortage_init;
674 struct proc *p, *bigproc;
675 vm_offset_t size, bigsize;
676 vm_object_t object;
677 int actcount;
678 int vnodes_skipped = 0;
679 int maxlaunder;
680 int s;
681 struct thread *td;
682
683 GIANT_REQUIRED;
684 /*
685 * Decrease registered cache sizes.
686 */
687 EVENTHANDLER_INVOKE(vm_lowmem, 0);
688 /*
689 * We do this explicitly after the caches have been drained above.
690 */
691 uma_reclaim();
692 /*
693 * Do whatever cleanup that the pmap code can.
694 */
695 vm_pageout_pmap_collect();
696
697 addl_page_shortage_init = vm_pageout_deficit;
698 vm_pageout_deficit = 0;
699
700 /*
701 * Calculate the number of pages we want to either free or move
702 * to the cache.
703 */
704 page_shortage = vm_paging_target() + addl_page_shortage_init;
705 save_page_shortage = page_shortage;
706 save_inactive_count = cnt.v_inactive_count;
707
708 /*
709 * Initialize our marker
710 */
711 bzero(&marker, sizeof(marker));
712 marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
713 marker.queue = PQ_INACTIVE;
714 marker.wire_count = 1;
715
716 /*
717 * Start scanning the inactive queue for pages we can move to the
718 * cache or free. The scan will stop when the target is reached or
719 * we have scanned the entire inactive queue. Note that m->act_count
720 * is not used to form decisions for the inactive queue, only for the
721 * active queue.
722 *
723 * maxlaunder limits the number of dirty pages we flush per scan.
724 * For most systems a smaller value (16 or 32) is more robust under
725 * extreme memory and disk pressure because any unnecessary writes
726 * to disk can result in extreme performance degradation. However,
727 * systems with excessive dirty pages (especially when MAP_NOSYNC is
728 * used) will die horribly with limited laundering. If the pageout
729 * daemon cannot clean enough pages in the first pass, we let it go
730 * all out in succeeding passes.
731 */
732 if ((maxlaunder = vm_max_launder) <= 1)
733 maxlaunder = 1;
734 if (pass)
735 maxlaunder = 10000;
736 rescan0:
737 addl_page_shortage = addl_page_shortage_init;
738 maxscan = cnt.v_inactive_count;
739
740 for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
741 m != NULL && maxscan-- > 0 && page_shortage > 0;
742 m = next) {
743
744 cnt.v_pdpages++;
745
746 if (m->queue != PQ_INACTIVE) {
747 goto rescan0;
748 }
749
750 next = TAILQ_NEXT(m, pageq);
751
752 /*
753 * skip marker pages
754 */
755 if (m->flags & PG_MARKER)
756 continue;
757
758 /*
759 * A held page may be undergoing I/O, so skip it.
760 */
761 if (m->hold_count) {
762 vm_pageq_requeue(m);
763 addl_page_shortage++;
764 continue;
765 }
766 /*
767 * Don't mess with busy pages; keep them at the front of the
768 * queue, as they are most likely being paged out.
769 */
770 if (m->busy || (m->flags & PG_BUSY)) {
771 addl_page_shortage++;
772 continue;
773 }
774
775 vm_page_lock_queues();
776 /*
777 * If the object is not being used, we ignore previous
778 * references.
779 */
780 if (m->object->ref_count == 0) {
781 vm_page_flag_clear(m, PG_REFERENCED);
782 pmap_clear_reference(m);
783
784 /*
785 * Otherwise, if the page has been referenced while in the
786 * inactive queue, we bump the "activation count" upwards,
787 * making it less likely that the page will be added back to
788 * the inactive queue prematurely again. Here we check the
789 * page tables (or emulated bits, if any), since the upper
790 * level VM system does not know anything about existing
791 * references.
792 */
793 } else if (((m->flags & PG_REFERENCED) == 0) &&
794 (actcount = pmap_ts_referenced(m))) {
795 vm_page_activate(m);
796 vm_page_unlock_queues();
797 m->act_count += (actcount + ACT_ADVANCE);
798 continue;
799 }
800
801 /*
802 * If the upper level VM system knows about any page
803 * references, we activate the page. We also set the
804 * "activation count" higher than normal so that we are less
805 * likely to place pages back onto the inactive queue again.
806 */
807 if ((m->flags & PG_REFERENCED) != 0) {
808 vm_page_flag_clear(m, PG_REFERENCED);
809 actcount = pmap_ts_referenced(m);
810 vm_page_activate(m);
811 vm_page_unlock_queues();
812 m->act_count += (actcount + ACT_ADVANCE + 1);
813 continue;
814 }
815
816 /*
817 * If the upper level VM system doesn't know anything about
818 * the page being dirty, we have to check for it again. As
819 * far as the VM code knows, any partially dirty pages are
820 * fully dirty.
821 */
822 if (m->dirty == 0) {
823 vm_page_test_dirty(m);
824 } else {
825 vm_page_dirty(m);
826 }
827 vm_page_unlock_queues();
828
829 /*
830 * Invalid pages can be easily freed
831 */
832 if (m->valid == 0) {
833 vm_page_lock_queues();
834 vm_pageout_page_free(m);
835 vm_page_unlock_queues();
836 --page_shortage;
837
838 /*
839 * Clean pages can be placed onto the cache queue. This
840 * effectively frees them.
841 */
842 } else if (m->dirty == 0) {
843 vm_page_lock_queues();
844 vm_page_cache(m);
845 vm_page_unlock_queues();
846 --page_shortage;
847 } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
848 /*
849 * Dirty pages need to be paged out, but flushing
850 * a page is extremely expensive versus freeing
851 * a clean page. Rather than artificially limiting
852 * the number of pages we can flush, we instead give
853 * dirty pages extra priority on the inactive queue
854 * by forcing them to be cycled through the queue
855 * twice before being flushed, after which the
856 * (now clean) page will cycle through once more
857 * before being freed. This significantly extends
858 * the thrash point for a heavily loaded machine.
859 */
860 vm_page_flag_set(m, PG_WINATCFLS);
861 vm_pageq_requeue(m);
862 } else if (maxlaunder > 0) {
863 /*
864 * We always want to try to flush some dirty pages if
865 * we encounter them, to keep the system stable.
866 * Normally this number is small, but under extreme
867 * pressure where there are insufficient clean pages
868 * on the inactive queue, we may have to go all out.
869 */
870 int swap_pageouts_ok;
871 struct vnode *vp = NULL;
872 struct mount *mp;
873
874 object = m->object;
875
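/*
 * Decide whether this page may be laundered to its backing store:
 * vnode-backed pages are always eligible; swap/default-backed
 * pages are eligible only when swap pageouts are neither deferred
 * nor disabled, or when they are merely deferred but free memory
 * has fallen below the minimum.
 */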
876 if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
877 swap_pageouts_ok = 1;
878 } else {
879 swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
880 swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
881 vm_page_count_min());
882
883 }
884
885 /*
886 * We don't bother paging objects that are "dead".
887 * Those objects are in a "rundown" state.
888 */
889 if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
890 vm_pageq_requeue(m);
891 continue;
892 }
893
894 /*
895 * The object is already known NOT to be dead. It
896 * is possible for the vget() to block the whole
897 * pageout daemon, but the new low-memory handling
898 * code should prevent it.
899 *
900 * The previous code skipped locked vnodes and, worse,
901 * reordered pages in the queue. This results in
902 * completely non-deterministic operation and, on a
903 * busy system, can lead to extremely non-optimal
904 * pageouts. For example, it can cause clean pages
905 * to be freed and dirty pages to be moved to the end
906 * of the queue. Since dirty pages are also moved to
907 * the end of the queue once-cleaned, this gives
908 * way too large a weighting to deferring the freeing
909 * of dirty pages.
910 *
911 * We can't wait forever for the vnode lock, we might
912 * deadlock due to a vn_read() getting stuck in
913 * vm_wait while holding this vnode. We skip the
914 * vnode if we can't get it in a reasonable amount
915 * of time.
916 */
917 if (object->type == OBJT_VNODE) {
918 vp = object->handle;
919
920 mp = NULL;
921 if (vp->v_type == VREG)
922 vn_start_write(vp, &mp, V_NOWAIT);
923 if (vget(vp, LK_EXCLUSIVE|LK_TIMELOCK, curthread)) {
924 ++pageout_lock_miss;
925 vn_finished_write(mp);
926 if (object->flags & OBJ_MIGHTBEDIRTY)
927 vnodes_skipped++;
928 continue;
929 }
930
931 /*
932 * The page might have been moved to another
933 * queue during potential blocking in vget()
934 * above. The page might have been freed and
935 * reused for another vnode. The object might
936 * have been reused for another vnode.
937 */
938 if (m->queue != PQ_INACTIVE ||
939 m->object != object ||
940 object->handle != vp) {
941 if (object->flags & OBJ_MIGHTBEDIRTY)
942 vnodes_skipped++;
943 vput(vp);
944 vn_finished_write(mp);
945 continue;
946 }
947
948 /*
949 * The page may have been busied during the
950 * blocking in vget(). We don't move the
951 * page back onto the end of the queue so that
952 * the statistics remain more accurate.
953 */
954 if (m->busy || (m->flags & PG_BUSY)) {
955 vput(vp);
956 vn_finished_write(mp);
957 continue;
958 }
959
960 /*
961 * If the page has become held it might
962 * be undergoing I/O, so skip it
963 */
964 if (m->hold_count) {
965 vm_pageq_requeue(m);
966 if (object->flags & OBJ_MIGHTBEDIRTY)
967 vnodes_skipped++;
968 vput(vp);
969 vn_finished_write(mp);
970 continue;
971 }
972 }
973
974 /*
975 * If a page is dirty, then it is either being washed
976 * (but not yet cleaned) or it is still in the
977 * laundry. If it is still in the laundry, then we
978 * start the cleaning operation.
979 *
980 * This operation may cluster, invalidating the 'next'
981 * pointer. To prevent an inordinate number of
982 * restarts we use our marker to remember our place.
983 *
984 * decrement page_shortage on success to account for
985 * the (future) cleaned page. Otherwise we could wind
986 * up laundering or cleaning too many pages.
987 */
988 vm_page_lock_queues();
989 s = splvm();
990 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
991 splx(s);
992 if (vm_pageout_clean(m) != 0) {
993 --page_shortage;
994 --maxlaunder;
995 }
996 s = splvm();
997 next = TAILQ_NEXT(&marker, pageq);
998 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
999 splx(s);
1000 vm_page_unlock_queues();
1001 if (vp) {
1002 vput(vp);
1003 vn_finished_write(mp);
1004 }
1005 }
1006 }
1007
1008 /*
1009 * Compute the number of pages we want to try to move from the
1010 * active queue to the inactive queue.
1011 */
1012 page_shortage = vm_paging_target() +
1013 cnt.v_inactive_target - cnt.v_inactive_count;
1014 page_shortage += addl_page_shortage;
1015
1016 vm_page_lock_queues();
1017 /*
1018 * Scan the active queue for things we can deactivate. We nominally
1019 * track the per-page activity counter and use it to locate
1020 * deactivation candidates.
1021 */
1022 pcount = cnt.v_active_count;
1023 m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
1024
1025 while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
1026
1027 /*
1028 * This is a consistency check, and should likely be a panic
1029 * or warning.
1030 */
1031 if (m->queue != PQ_ACTIVE) {
1032 break;
1033 }
1034
1035 next = TAILQ_NEXT(m, pageq);
1036 /*
1037 * Don't deactivate pages that are busy.
1038 */
1039 if ((m->busy != 0) ||
1040 (m->flags & PG_BUSY) ||
1041 (m->hold_count != 0)) {
1042 vm_pageq_requeue(m);
1043 m = next;
1044 continue;
1045 }
1046
1047 /*
1048 * The count for pagedaemon pages is done after checking the
1049 * page for eligibility...
1050 */
1051 cnt.v_pdpages++;
1052
1053 /*
1054 * Check to see "how much" the page has been used.
1055 */
1056 actcount = 0;
1057 if (m->object->ref_count != 0) {
1058 if (m->flags & PG_REFERENCED) {
1059 actcount += 1;
1060 }
1061 actcount += pmap_ts_referenced(m);
1062 if (actcount) {
1063 m->act_count += ACT_ADVANCE + actcount;
1064 if (m->act_count > ACT_MAX)
1065 m->act_count = ACT_MAX;
1066 }
1067 }
1068
1069 /*
1070 * Since we have "tested" this bit, we need to clear it now.
1071 */
1072 vm_page_flag_clear(m, PG_REFERENCED);
1073
1074 /*
1075 * Only if an object is currently being used do we use the
1076 * page activation count stats.
1077 */
1078 if (actcount && (m->object->ref_count != 0)) {
1079 vm_pageq_requeue(m);
1080 } else {
1081 m->act_count -= min(m->act_count, ACT_DECLINE);
1082 if (vm_pageout_algorithm ||
1083 m->object->ref_count == 0 ||
1084 m->act_count == 0) {
1085 page_shortage--;
1086 if (m->object->ref_count == 0) {
1087 pmap_remove_all(m);
1088 if (m->dirty == 0)
1089 vm_page_cache(m);
1090 else
1091 vm_page_deactivate(m);
1092 } else {
1093 vm_page_deactivate(m);
1094 }
1095 } else {
1096 vm_pageq_requeue(m);
1097 }
1098 }
1099 m = next;
1100 }
1101 s = splvm();
1102
1103 /*
1104 * We try to maintain some *really* free pages; this allows interrupt
1105 * code to be guaranteed space. Since both cache and free queues
1106 * are considered basically 'free', moving pages from cache to free
1107 * does not affect other calculations.
1108 */
1109 while (cnt.v_free_count < cnt.v_free_reserved) {
1110 static int cache_rover = 0;
1111 m = vm_pageq_find(PQ_CACHE, cache_rover, FALSE);
1112 if (!m)
1113 break;
1114 if ((m->flags & (PG_BUSY|PG_UNMANAGED)) ||
1115 m->busy ||
1116 m->hold_count ||
1117 m->wire_count) {
1118 #ifdef INVARIANTS
1119 printf("Warning: busy page %p found in cache\n", m);
1120 #endif
1121 vm_page_deactivate(m);
1122 continue;
1123 }
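/*
 * Advance the rover by a prime stride (masked to the number of
 * cache queue colors) so successive reclaims rotate through the
 * PQ_CACHE colors instead of repeatedly draining one of them.
 */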
1124 cache_rover = (cache_rover + PQ_PRIME2) & PQ_L2_MASK;
1125 vm_pageout_page_free(m);
1126 }
1127 splx(s);
1128 vm_page_unlock_queues();
1129 #if !defined(NO_SWAPPING)
1130 /*
1131 * Idle process swapout -- run once per second.
1132 */
1133 if (vm_swap_idle_enabled) {
1134 static long lsec;
1135 if (time_second != lsec) {
1136 vm_pageout_req_swapout |= VM_SWAP_IDLE;
1137 vm_req_vmdaemon();
1138 lsec = time_second;
1139 }
1140 }
1141 #endif
1142
1143 /*
1144 * If we didn't get enough free pages and we have skipped a vnode
1145 * in a writeable object, wake up the sync daemon. Also kick off
1146 * swapout if we did not get enough free pages.
1147 */
1148 if (vm_paging_target() > 0) {
1149 if (vnodes_skipped && vm_page_count_min())
1150 (void) speedup_syncer();
1151 #if !defined(NO_SWAPPING)
1152 if (vm_swap_enabled && vm_page_count_target()) {
1153 vm_req_vmdaemon();
1154 vm_pageout_req_swapout |= VM_SWAP_NORMAL;
1155 }
1156 #endif
1157 }
1158
1159 /*
1160 * If we are out of swap and were not able to reach our paging
1161 * target, kill the largest process.
1162 *
1163 * We keep the process bigproc locked once we find it to keep anyone
1164 * from messing with it; however, there is a possibility of
1165 * deadlock if process B is bigproc and one of its child processes
1166 * attempts to propagate a signal to B while we are waiting for the
1167 * child's lock while walking this list. To avoid this, we don't block on
1168 * the process lock but just skip a process if it is already locked.
1169 */
1170 if ((vm_swap_size < 64 && vm_page_count_min()) ||
1171 (swap_pager_full && vm_paging_target() > 0)) {
1172 #if 0
1173 if ((vm_swap_size < 64 || swap_pager_full) && vm_page_count_min()) {
1174 #endif
1175 bigproc = NULL;
1176 bigsize = 0;
1177 sx_slock(&allproc_lock);
1178 FOREACH_PROC_IN_SYSTEM(p) {
1179 int breakout;
1180 /*
1181 * If this process is already locked, skip it.
1182 */
1183 if (PROC_TRYLOCK(p) == 0)
1184 continue;
1185 /*
1186 * if this is a system process, skip it
1187 */
1188 if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) ||
1189 ((p->p_pid < 48) && (vm_swap_size != 0))) {
1190 PROC_UNLOCK(p);
1191 continue;
1192 }
1193 /*
1194 * if the process is in a non-running type state,
1195 * don't touch it. Check all the threads individually.
1196 */
1197 mtx_lock_spin(&sched_lock);
1198 breakout = 0;
1199 FOREACH_THREAD_IN_PROC(p, td) {
1200 if (!TD_ON_RUNQ(td) &&
1201 !TD_IS_RUNNING(td) &&
1202 !TD_IS_SLEEPING(td)) {
1203 breakout = 1;
1204 break;
1205 }
1206 }
1207 if (breakout) {
1208 mtx_unlock_spin(&sched_lock);
1209 PROC_UNLOCK(p);
1210 continue;
1211 }
1212 mtx_unlock_spin(&sched_lock);
1213 /*
1214 * get the process size
1215 */
1216 size = vmspace_resident_count(p->p_vmspace) +
1217 vmspace_swap_count(p->p_vmspace);
1218 /*
1219 * if this process is bigger than the biggest one,
1220 * remember it.
1221 */
1222 if (size > bigsize) {
1223 if (bigproc != NULL)
1224 PROC_UNLOCK(bigproc);
1225 bigproc = p;
1226 bigsize = size;
1227 } else
1228 PROC_UNLOCK(p);
1229 }
1230 sx_sunlock(&allproc_lock);
1231 if (bigproc != NULL) {
1232 struct ksegrp *kg;
1233 killproc(bigproc, "out of swap space");
1234 mtx_lock_spin(&sched_lock);
1235 FOREACH_KSEGRP_IN_PROC(bigproc, kg) {
1236 sched_nice(kg, PRIO_MIN); /* XXXKSE ??? */
1237 }
1238 mtx_unlock_spin(&sched_lock);
1239 PROC_UNLOCK(bigproc);
1240 wakeup(&cnt.v_free_count);
1241 }
1242 }
1243 }
1244
1245 /*
1246 * This routine tries to maintain the pseudo LRU active queue,
1247 * so that during long periods of time when there is no paging,
1248 * some statistic accumulation still occurs. This code
1249 * helps the situation where paging just starts to occur.
1250 */
1251 static void
1252 vm_pageout_page_stats()
1253 {
1254 vm_page_t m,next;
1255 int pcount,tpcount; /* Number of pages to check */
1256 static int fullintervalcount = 0;
1257 int page_shortage;
1258 int s0;
1259
1260 page_shortage =
1261 (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
1262 (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
1263
1264 if (page_shortage <= 0)
1265 return;
1266
1267 s0 = splvm();
1268 vm_page_lock_queues();
1269 pcount = cnt.v_active_count;
1270 fullintervalcount += vm_pageout_stats_interval;
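/*
 * Until a full-interval scan is due, only examine the active
 * queue's proportional share of vm_pageout_stats_max pages; once
 * the full interval elapses, scan the whole active queue.
 */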
1271 if (fullintervalcount < vm_pageout_full_stats_interval) {
1272 tpcount = (vm_pageout_stats_max * cnt.v_active_count) / cnt.v_page_count;
1273 if (pcount > tpcount)
1274 pcount = tpcount;
1275 } else {
1276 fullintervalcount = 0;
1277 }
1278
1279 m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
1280 while ((m != NULL) && (pcount-- > 0)) {
1281 int actcount;
1282
1283 if (m->queue != PQ_ACTIVE) {
1284 break;
1285 }
1286
1287 next = TAILQ_NEXT(m, pageq);
1288 /*
1289 * Don't deactivate pages that are busy.
1290 */
1291 if ((m->busy != 0) ||
1292 (m->flags & PG_BUSY) ||
1293 (m->hold_count != 0)) {
1294 vm_pageq_requeue(m);
1295 m = next;
1296 continue;
1297 }
1298
1299 actcount = 0;
1300 if (m->flags & PG_REFERENCED) {
1301 vm_page_flag_clear(m, PG_REFERENCED);
1302 actcount += 1;
1303 }
1304
1305 actcount += pmap_ts_referenced(m);
1306 if (actcount) {
1307 m->act_count += ACT_ADVANCE + actcount;
1308 if (m->act_count > ACT_MAX)
1309 m->act_count = ACT_MAX;
1310 vm_pageq_requeue(m);
1311 } else {
1312 if (m->act_count == 0) {
1313 /*
1314 * We turn off page access, so that we have
1315 * more accurate RSS stats. We don't do this
1316 * in the normal page deactivation when the
1317 * system is loaded VM wise, because the
1318 * cost of the large number of page protect
1319 * operations would be higher than the value
1320 * of doing the operation.
1321 */
1322 pmap_remove_all(m);
1323 vm_page_deactivate(m);
1324 } else {
1325 m->act_count -= min(m->act_count, ACT_DECLINE);
1326 vm_pageq_requeue(m);
1327 }
1328 }
1329
1330 m = next;
1331 }
1332 vm_page_unlock_queues();
1333 splx(s0);
1334 }
1335
1336 static int
1337 vm_pageout_free_page_calc(count)
1338 vm_size_t count;
1339 {
1340 if (count < cnt.v_page_count)
1341 return 0;
1342 /*
1343 * free_reserved needs to include enough for the largest swap pager
1344 * structures plus enough for any pv_entry structs when paging.
1345 */
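/*
 * v_free_min starts at 4 pages and grows by roughly one page per
 * 200 pages of memory beyond the first 1024 pages.
 */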
1346 if (cnt.v_page_count > 1024)
1347 cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200;
1348 else
1349 cnt.v_free_min = 4;
1350 cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
1351 cnt.v_interrupt_free_min;
1352 cnt.v_free_reserved = vm_pageout_page_count +
1353 cnt.v_pageout_free_min + (count / 768) + PQ_L2_SIZE;
1354 cnt.v_free_severe = cnt.v_free_min / 2;
1355 cnt.v_free_min += cnt.v_free_reserved;
1356 cnt.v_free_severe += cnt.v_free_reserved;
1357 return 1;
1358 }
1359
1360 /*
1361 * vm_pageout is the high level pageout daemon.
1362 */
1363 static void
1364 vm_pageout()
1365 {
1366 int pass;
1367
1368 mtx_lock(&Giant);
1369
1370 /*
1371 * Initialize some paging parameters.
1372 */
1373 cnt.v_interrupt_free_min = 2;
1374 if (cnt.v_page_count < 2000)
1375 vm_pageout_page_count = 8;
1376
1377 vm_pageout_free_page_calc(cnt.v_page_count);
1378 /*
1379 * v_free_target and v_cache_min control pageout hysteresis. Note
1380 * that these are more a measure of the VM cache queue hysteresis
1381 * than the VM free queue. Specifically, v_free_target is the
1382 * high water mark (free+cache pages).
1383 *
1384 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
1385 * low water mark, while v_free_min is the stop. v_cache_min must
1386 * be big enough to handle memory needs while the pageout daemon
1387 * is signalled and run to free more pages.
1388 */
1389 if (cnt.v_free_count > 6144)
1390 cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
1391 else
1392 cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;
1393
1394 if (cnt.v_free_count > 2048) {
1395 cnt.v_cache_min = cnt.v_free_target;
1396 cnt.v_cache_max = 2 * cnt.v_cache_min;
1397 cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
1398 } else {
1399 cnt.v_cache_min = 0;
1400 cnt.v_cache_max = 0;
1401 cnt.v_inactive_target = cnt.v_free_count / 4;
1402 }
1403 if (cnt.v_inactive_target > cnt.v_free_count / 3)
1404 cnt.v_inactive_target = cnt.v_free_count / 3;
1405
1406 /* XXX does not really belong here */
1407 if (vm_page_max_wired == 0)
1408 vm_page_max_wired = cnt.v_free_count / 3;
1409
1410 if (vm_pageout_stats_max == 0)
1411 vm_pageout_stats_max = cnt.v_free_target;
1412
1413 /*
1414 * Set interval in seconds for stats scan.
1415 */
1416 if (vm_pageout_stats_interval == 0)
1417 vm_pageout_stats_interval = 5;
1418 if (vm_pageout_full_stats_interval == 0)
1419 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
1420
1421 /*
1422 * Set maximum free per pass
1423 */
1424 if (vm_pageout_stats_free_max == 0)
1425 vm_pageout_stats_free_max = 5;
1426
1427 swap_pager_swap_init();
1428 pass = 0;
1429 /*
1430 * The pageout daemon is never done, so loop forever.
1431 */
1432 while (TRUE) {
1433 int error;
1434 int s = splvm();
1435
1436 /*
1437 * If we have enough free memory, wakeup waiters. Do
1438 * not clear vm_pages_needed until we reach our target,
1439 * otherwise we may be woken up over and over again and
1440 * waste a lot of cpu.
1441 */
1442 if (vm_pages_needed && !vm_page_count_min()) {
1443 if (vm_paging_needed() <= 0)
1444 vm_pages_needed = 0;
1445 wakeup(&cnt.v_free_count);
1446 }
1447 if (vm_pages_needed) {
1448 /*
1449 * Still not done, take a second pass without waiting
1450 * (unlimited dirty cleaning), otherwise sleep a bit
1451 * and try again.
1452 */
1453 ++pass;
1454 if (pass > 1)
1455 tsleep(&vm_pages_needed, PVM,
1456 "psleep", hz/2);
1457 } else {
1458 /*
1459 * Good enough, sleep & handle stats. Prime the pass
1460 * for the next run.
1461 */
1462 if (pass > 1)
1463 pass = 1;
1464 else
1465 pass = 0;
1466 error = tsleep(&vm_pages_needed, PVM,
1467 "psleep", vm_pageout_stats_interval * hz);
1468 if (error && !vm_pages_needed) {
1469 splx(s);
1470 pass = 0;
1471 vm_pageout_page_stats();
1472 continue;
1473 }
1474 }
1475
1476 if (vm_pages_needed)
1477 cnt.v_pdwakeups++;
1478 splx(s);
1479 vm_pageout_scan(pass);
1480 vm_pageout_deficit = 0;
1481 }
1482 }
1483
1484 void
1485 pagedaemon_wakeup()
1486 {
1487 if (!vm_pages_needed && curthread->td_proc != pageproc) {
1488 vm_pages_needed++;
1489 wakeup(&vm_pages_needed);
1490 }
1491 }
1492
1493 #if !defined(NO_SWAPPING)
1494 static void
1495 vm_req_vmdaemon()
1496 {
1497 static int lastrun = 0;
1498
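/*
 * Rate-limit vmdaemon wakeups to roughly one per second; the
 * second comparison handles wraparound of the ticks counter.
 */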
1499 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
1500 wakeup(&vm_daemon_needed);
1501 lastrun = ticks;
1502 }
1503 }
1504
1505 static void
1506 vm_daemon()
1507 {
1508 struct proc *p;
1509 int breakout;
1510 struct thread *td;
1511
1512 mtx_lock(&Giant);
1513 while (TRUE) {
1514 tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0);
1515 if (vm_pageout_req_swapout) {
1516 swapout_procs(vm_pageout_req_swapout);
1517 vm_pageout_req_swapout = 0;
1518 }
1519 /*
1520 * scan the processes for those exceeding their rlimits or
1521 * for processes that are swapped out -- deactivate pages
1522 */
1523 sx_slock(&allproc_lock);
1524 LIST_FOREACH(p, &allproc, p_list) {
1525 vm_pindex_t limit, size;
1526
1527 /*
1528 * if this is a system process or if we have already
1529 * looked at this process, skip it.
1530 */
1531 if (p->p_flag & (P_SYSTEM | P_WEXIT)) {
1532 continue;
1533 }
1534 /*
1535 * if the process is in a non-running type state,
1536 * don't touch it.
1537 */
1538 mtx_lock_spin(&sched_lock);
1539 breakout = 0;
1540 FOREACH_THREAD_IN_PROC(p, td) {
1541 if (!TD_ON_RUNQ(td) &&
1542 !TD_IS_RUNNING(td) &&
1543 !TD_IS_SLEEPING(td)) {
1544 breakout = 1;
1545 break;
1546 }
1547 }
1548 if (breakout) {
1549 mtx_unlock_spin(&sched_lock);
1550 continue;
1551 }
1552 /*
1553 * get a limit
1554 */
1555 limit = OFF_TO_IDX(
1556 qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
1557 p->p_rlimit[RLIMIT_RSS].rlim_max));
1558
1559 /*
1560 * let processes that are swapped out really be
1561 * swapped out: set the limit to nothing (this will
1562 * force a swap-out).
1563 */
1564 if ((p->p_sflag & PS_INMEM) == 0)
1565 limit = 0; /* XXX */
1566 mtx_unlock_spin(&sched_lock);
1567
1568 size = vmspace_resident_count(p->p_vmspace);
1569 if (limit >= 0 && size >= limit) {
1570 vm_pageout_map_deactivate_pages(
1571 &p->p_vmspace->vm_map, limit);
1572 }
1573 }
1574 sx_sunlock(&allproc_lock);
1575 }
1576 }
1577 #endif /* !defined(NO_SWAPPING) */