sys/vm/vm_pageout.c
/*
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $
 */

/*
 * The proverbial page-out daemon.
 */

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout" */
static int vm_pageout_clean (vm_page_t);
static int vm_pageout_free_page_calc (vm_size_t count);
struct thread *pagethread;

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon" */
static void vm_daemon (void);
static struct thread *vmthread;

static struct kproc_desc vm_kp = {
        "vmdaemon",
        vm_daemon,
        &vmthread
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp)
#endif

int vm_pages_needed = 0;        /* Event on which pageout daemon sleeps */
int vm_pageout_deficit = 0;     /* Estimated number of pages deficit */
int vm_pageout_pages_needed = 0; /* pageout daemon needs pages */
int vm_page_free_hysteresis = 16;

#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout;      /* XXX */
static int vm_daemon_needed;
#endif
static int vm_max_launder = 4096;
static int vm_pageout_stats_max = 0, vm_pageout_stats_interval = 0;
static int vm_pageout_full_stats_interval = 0;
static int vm_pageout_stats_free_max = 0, vm_pageout_algorithm = 0;
static int defer_swap_pageouts = 0;
static int disable_swap_pageouts = 0;
static u_int vm_anonmem_decline = ACT_DECLINE;
static u_int vm_filemem_decline = ACT_DECLINE * 2;

#if defined(NO_SWAPPING)
static int vm_swap_enabled = 0;
static int vm_swap_idle_enabled = 0;
#else
static int vm_swap_enabled = 1;
static int vm_swap_idle_enabled = 0;
#endif

SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
        CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");

SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
        CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");

SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
        CTLFLAG_RW, &vm_page_free_hysteresis, 0,
        "Free more pages than the minimum required");

SYSCTL_INT(_vm, OID_AUTO, max_launder,
        CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
        CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");

SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
        CTLFLAG_RW, &vm_pageout_full_stats_interval, 0,
        "Interval for full stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
        CTLFLAG_RW, &vm_pageout_stats_interval, 0,
        "Interval for partial stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
        CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");

#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
        CTLFLAG_RD, &vm_swap_enabled, 0, "");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
        CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
        CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
        CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
        CTLFLAG_RW, &defer_swap_pageouts, 0,
        "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
        CTLFLAG_RW, &disable_swap_pageouts, 0,
        "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
        CTLFLAG_RD, &pageout_lock_miss, 0,
        "vget() lock misses during pageout");

int vm_page_max_wired;          /* XXX max # of wired pages system-wide */

#if !defined(NO_SWAPPING)
typedef void freeer_fcn_t (vm_map_t, vm_object_t, vm_pindex_t, int);
static void vm_pageout_map_deactivate_pages (vm_map_t, vm_pindex_t);
static freeer_fcn_t vm_pageout_object_deactivate_pages;
static void vm_req_vmdaemon (void);
#endif
static void vm_pageout_page_stats(int q);

static __inline int
PQAVERAGE(int n)
{
        if (n >= 0)
                return((n + (PQ_L2_SIZE - 1)) / PQ_L2_SIZE + 1);
        else
                return((n - (PQ_L2_SIZE - 1)) / PQ_L2_SIZE - 1);
}
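
/*
 * Example (illustrative, not from the original source): PQAVERAGE()
 * divides a global page shortage across the PQ_L2_SIZE per-queue scans,
 * rounding away from zero so every queue is asked to do at least a
 * little work.  Assuming PQ_L2_SIZE is 256:
 *
 *      PQAVERAGE(1000)  == (1000 + 255) / 256 + 1  ==  5
 *      PQAVERAGE(-1000) == (-1000 - 255) / 256 - 1 == -5
 *
 * (C integer division truncates toward zero, hence the +/- 1 bias.)
 */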

/*
 * vm_pageout_clean:
 *
 * Clean the page and remove it from the laundry.  The page must not be
 * busy on-call.
 *
 * We set the busy bit to cause potential page faults on this page to
 * block.  Note the careful timing, however, the busy bit isn't set till
 * late and we cannot do anything that will mess with the page.
 */
static int
vm_pageout_clean(vm_page_t m)
{
        vm_object_t object;
        vm_page_t mc[BLIST_MAX_ALLOC];
        int error;
        int ib, is, page_base;
        vm_pindex_t pindex = m->pindex;

        object = m->object;

        /*
         * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
         * with the new swapper, but we could have serious problems paging
         * out other object types if there is insufficient memory.
         *
         * Unfortunately, checking free memory here is far too late, so the
         * check has been moved up a procedural level.
         */

        /*
         * Don't mess with the page if it's busy, held, or special.
         *
         * XXX do we really need to check hold_count here?  hold_count
         * isn't supposed to mess with vm_page ops except prevent the
         * page from being reused.
         */
        if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) {
                vm_page_wakeup(m);
                return 0;
        }

        /*
         * Place page in cluster.  Align cluster for optimal swap space
         * allocation (whether it is swap or not).  This is typically ~16-32
         * pages, which also tends to align the cluster to multiples of the
         * filesystem block size if backed by a filesystem.
         */
        page_base = pindex % BLIST_MAX_ALLOC;
        mc[page_base] = m;
        ib = page_base - 1;
        is = page_base + 1;
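
        /*
         * Example (illustrative, not from the original source): assuming
         * BLIST_MAX_ALLOC is 32, a page at pindex 37 yields
         * page_base = 37 % 32 = 5, so mc[5] = m and the cluster may grow
         * backwards through mc[4..0] (pindex 36..32) and forwards through
         * mc[6..31] (pindex 38..63).  The flushed cluster therefore always
         * stays inside one naturally aligned 32-page window of the object.
         */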

        /*
         * Scan object for clusterable pages.
         *
         * We can cluster ONLY if: ->> the page is NOT
         * clean, wired, busy, held, or mapped into a
         * buffer, and one of the following:
         * 1) The page is inactive, or a seldom used
         *    active page.
         * -or-
         * 2) we force the issue.
         *
         * During heavy mmap/modification loads the pageout
         * daemon can really fragment the underlying file
         * due to flushing pages out of order and not trying
         * to align the clusters (which leaves sporadic
         * out-of-order holes).  To solve this problem we do
         * the reverse scan first and attempt to align our
         * cluster, then do a forward scan if room remains.
         */
        vm_object_hold(object);
        while (ib >= 0) {
                vm_page_t p;

                p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
                                            TRUE, &error);
                if (error || p == NULL)
                        break;
                if ((p->queue - p->pc) == PQ_CACHE ||
                    (p->flags & PG_UNMANAGED)) {
                        vm_page_wakeup(p);
                        break;
                }
                vm_page_test_dirty(p);
                if (((p->dirty & p->valid) == 0 &&
                     (p->flags & PG_NEED_COMMIT) == 0) ||
                    p->queue - p->pc != PQ_INACTIVE ||
                    p->wire_count != 0 ||       /* may be held by buf cache */
                    p->hold_count != 0) {       /* may be undergoing I/O */
                        vm_page_wakeup(p);
                        break;
                }
                mc[ib] = p;
                --ib;
        }
        ++ib;   /* fixup */

        while (is < BLIST_MAX_ALLOC &&
               pindex - page_base + is < object->size) {
                vm_page_t p;

                p = vm_page_lookup_busy_try(object, pindex - page_base + is,
                                            TRUE, &error);
                if (error || p == NULL)
                        break;
                if (((p->queue - p->pc) == PQ_CACHE) ||
                    (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
                        vm_page_wakeup(p);
                        break;
                }
                vm_page_test_dirty(p);
                if (((p->dirty & p->valid) == 0 &&
                     (p->flags & PG_NEED_COMMIT) == 0) ||
                    p->queue - p->pc != PQ_INACTIVE ||
                    p->wire_count != 0 ||       /* may be held by buf cache */
                    p->hold_count != 0) {       /* may be undergoing I/O */
                        vm_page_wakeup(p);
                        break;
                }
                mc[is] = p;
                ++is;
        }

        vm_object_drop(object);

        /*
         * we allow reads during pageouts...
         */
        return vm_pageout_flush(&mc[ib], is - ib, 0);
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 * The given pages are laundered.  Note that we setup for the start of
 * I/O (i.e. busy the page), mark it read-only, and bump the object
 * reference count all in here rather than in the parent.  If we want
 * the parent to do more sophisticated things we may have to change
 * the ordering.
 *
 * The pages in the array must be busied by the caller and will be
 * unbusied by this function.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int flags)
{
        vm_object_t object;
        int pageout_status[count];
        int numpagedout = 0;
        int i;

        /*
         * Initiate I/O.  Bump the vm_page_t->busy counter.
         */
        for (i = 0; i < count; i++) {
                KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
                        ("vm_pageout_flush page %p index %d/%d: partially "
                         "invalid page", mc[i], i, count));
                vm_page_io_start(mc[i]);
        }

        /*
         * We must make the pages read-only.  This will also force the
         * modified bit in the related pmaps to be cleared.  The pager
         * cannot clear the bit for us since the I/O completion code
         * typically runs from an interrupt.  The act of making the page
         * read-only handles the case for us.
         *
         * Then we can unbusy the pages; we still hold a reference by virtue
         * of our soft-busy.
         */
        for (i = 0; i < count; i++) {
                vm_page_protect(mc[i], VM_PROT_READ);
                vm_page_wakeup(mc[i]);
        }

        object = mc[0]->object;
        vm_object_pip_add(object, count);

        vm_pager_put_pages(object, mc, count,
                           (flags |
                            ((object == &kernel_object) ?
                             VM_PAGER_PUT_SYNC : 0)),
                           pageout_status);

        for (i = 0; i < count; i++) {
                vm_page_t mt = mc[i];

                switch (pageout_status[i]) {
                case VM_PAGER_OK:
                        numpagedout++;
                        break;
                case VM_PAGER_PEND:
                        numpagedout++;
                        break;
                case VM_PAGER_BAD:
                        /*
                         * Page outside of range of object.  Right now we
                         * essentially lose the changes by pretending it
                         * worked.
                         */
                        vm_page_busy_wait(mt, FALSE, "pgbad");
                        pmap_clear_modify(mt);
                        vm_page_undirty(mt);
                        vm_page_wakeup(mt);
                        break;
                case VM_PAGER_ERROR:
                case VM_PAGER_FAIL:
                        /*
                         * A page typically cannot be paged out when we
                         * have run out of swap.  We leave the page
                         * marked inactive and will try to page it out
                         * again later.
                         *
                         * Starvation of the active page list is used to
                         * determine when the system is massively memory
                         * starved.
                         */
                        break;
                case VM_PAGER_AGAIN:
                        break;
                }

                /*
                 * If the operation is still going, leave the page busy to
                 * block all other accesses.  Also, leave the paging in
                 * progress indicator set so that we don't attempt an object
                 * collapse.
                 *
                 * For any pages which have completed synchronously,
                 * deactivate the page if we are under a severe deficit.
                 * Do not try to enter them into the cache, though; they
                 * might still be read-heavy.
                 */
                if (pageout_status[i] != VM_PAGER_PEND) {
                        vm_page_busy_wait(mt, FALSE, "pgouw");
                        if (vm_page_count_severe())
                                vm_page_deactivate(mt);
#if 0
                        if (!vm_page_count_severe() || !vm_page_try_to_cache(mt))
                                vm_page_protect(mt, VM_PROT_READ);
#endif
                        vm_page_io_finish(mt);
                        vm_page_wakeup(mt);
                        vm_object_pip_wakeup(object);
                }
        }
        return numpagedout;
}

#if !defined(NO_SWAPPING)
/*
 * Deactivate enough pages to satisfy the inactive target
 * requirements, or if vm_page_proc_limit is set, then
 * deactivate all of the pages in the object and its
 * backing_objects.
 *
 * The map must be locked.
 * The caller must hold the vm_object.
 */
static int vm_pageout_object_deactivate_pages_callback(vm_page_t, void *);

static void
vm_pageout_object_deactivate_pages(vm_map_t map, vm_object_t object,
                                   vm_pindex_t desired, int map_remove_only)
{
        struct rb_vm_page_scan_info info;
        vm_object_t lobject;
        vm_object_t tobject;
        int remove_mode;

        ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
        lobject = object;

        while (lobject) {
                if (pmap_resident_count(vm_map_pmap(map)) <= desired)
                        break;
                if (lobject->type == OBJT_DEVICE ||
                    lobject->type == OBJT_MGTDEVICE ||
                    lobject->type == OBJT_PHYS)
                        break;
                if (lobject->paging_in_progress)
                        break;

                remove_mode = map_remove_only;
                if (lobject->shadow_count > 1)
                        remove_mode = 1;

                /*
                 * Scan the object's entire memory queue.  We hold the
                 * object's token so the scan should not race anything.
                 */
                info.limit = remove_mode;
                info.map = map;
                info.desired = desired;
                vm_page_rb_tree_RB_SCAN(&lobject->rb_memq, NULL,
                                vm_pageout_object_deactivate_pages_callback,
                                &info);
                while ((tobject = lobject->backing_object) != NULL) {
                        KKASSERT(tobject != object);
                        vm_object_hold(tobject);
                        if (tobject == lobject->backing_object)
                                break;
                        vm_object_drop(tobject);
                }
                if (lobject != object) {
                        if (tobject)
                                vm_object_lock_swap();
                        vm_object_drop(lobject);
                        /* leaves tobject locked & at top */
                }
                lobject = tobject;
        }
        if (lobject != object)
                vm_object_drop(lobject);        /* NULL ok */
}

/*
 * The caller must hold the vm_object.
 */
static int
vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data)
{
        struct rb_vm_page_scan_info *info = data;
        int actcount;

        if (pmap_resident_count(vm_map_pmap(info->map)) <= info->desired) {
                return(-1);
        }
        mycpu->gd_cnt.v_pdpages++;

        if (vm_page_busy_try(p, TRUE))
                return(0);
        if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) {
                vm_page_wakeup(p);
                return(0);
        }
        if (!pmap_page_exists_quick(vm_map_pmap(info->map), p)) {
                vm_page_wakeup(p);
                return(0);
        }

        actcount = pmap_ts_referenced(p);
        if (actcount) {
                vm_page_flag_set(p, PG_REFERENCED);
        } else if (p->flags & PG_REFERENCED) {
                actcount = 1;
        }

        vm_page_and_queue_spin_lock(p);
        if (p->queue - p->pc != PQ_ACTIVE && (p->flags & PG_REFERENCED)) {
                vm_page_and_queue_spin_unlock(p);
                vm_page_activate(p);
                p->act_count += actcount;
                vm_page_flag_clear(p, PG_REFERENCED);
        } else if (p->queue - p->pc == PQ_ACTIVE) {
                if ((p->flags & PG_REFERENCED) == 0) {
                        p->act_count -= min(p->act_count, ACT_DECLINE);
                        if (!info->limit &&
                            (vm_pageout_algorithm || (p->act_count == 0))) {
                                vm_page_and_queue_spin_unlock(p);
                                vm_page_protect(p, VM_PROT_NONE);
                                vm_page_deactivate(p);
                        } else {
                                TAILQ_REMOVE(&vm_page_queues[p->queue].pl,
                                             p, pageq);
                                TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl,
                                                  p, pageq);
                                vm_page_and_queue_spin_unlock(p);
                        }
                } else {
                        vm_page_and_queue_spin_unlock(p);
                        vm_page_activate(p);
                        vm_page_flag_clear(p, PG_REFERENCED);

                        vm_page_and_queue_spin_lock(p);
                        if (p->queue - p->pc == PQ_ACTIVE) {
                                if (p->act_count < (ACT_MAX - ACT_ADVANCE))
                                        p->act_count += ACT_ADVANCE;
                                TAILQ_REMOVE(&vm_page_queues[p->queue].pl,
                                             p, pageq);
                                TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl,
                                                  p, pageq);
                        }
                        vm_page_and_queue_spin_unlock(p);
                }
        } else if (p->queue - p->pc == PQ_INACTIVE) {
                vm_page_and_queue_spin_unlock(p);
                vm_page_protect(p, VM_PROT_NONE);
        } else {
                vm_page_and_queue_spin_unlock(p);
        }
        vm_page_wakeup(p);
        return(0);
}

/*
 * Deactivate some number of pages in a map; try to do it fairly, but
 * that is really hard to do.
 */
static void
vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t desired)
{
        vm_map_entry_t tmpe;
        vm_object_t obj, bigobj;
        int nothingwired;

        if (lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT)) {
                return;
        }

        bigobj = NULL;
        nothingwired = TRUE;

        /*
         * First, search out the biggest object, and try to free pages from
         * that.
         */
        tmpe = map->header.next;
        while (tmpe != &map->header) {
                switch(tmpe->maptype) {
                case VM_MAPTYPE_NORMAL:
                case VM_MAPTYPE_VPAGETABLE:
                        obj = tmpe->object.vm_object;
                        if ((obj != NULL) && (obj->shadow_count <= 1) &&
                            ((bigobj == NULL) ||
                             (bigobj->resident_page_count <
                              obj->resident_page_count))) {
                                bigobj = obj;
                        }
                        break;
                default:
                        break;
                }
                if (tmpe->wired_count > 0)
                        nothingwired = FALSE;
                tmpe = tmpe->next;
        }

        if (bigobj) {
                vm_object_hold(bigobj);
                vm_pageout_object_deactivate_pages(map, bigobj, desired, 0);
                vm_object_drop(bigobj);
        }

        /*
         * Next, hunt around for other pages to deactivate.  We actually
         * do this search sort of wrong -- .text first is not the best idea.
         */
        tmpe = map->header.next;
        while (tmpe != &map->header) {
                if (pmap_resident_count(vm_map_pmap(map)) <= desired)
                        break;
                switch(tmpe->maptype) {
                case VM_MAPTYPE_NORMAL:
                case VM_MAPTYPE_VPAGETABLE:
                        obj = tmpe->object.vm_object;
                        if (obj) {
                                vm_object_hold(obj);
                                vm_pageout_object_deactivate_pages(map, obj,
                                                                   desired, 0);
                                vm_object_drop(obj);
                        }
                        break;
                default:
                        break;
                }
                tmpe = tmpe->next;
        }

        /*
         * Remove all mappings if a process is swapped out; this will free
         * page table pages.
         */
        if (desired == 0 && nothingwired)
                pmap_remove(vm_map_pmap(map),
                            VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
        vm_map_unlock(map);
}
#endif

/*
 * Called when the pageout scan wants to free a page.  We no longer
 * try to cycle the vm_object here with a reference & dealloc, which can
 * cause a non-trivial object collapse in a critical path.
 *
 * It is unclear why we cycled the ref_count in the past, perhaps to try
 * to optimize shadow chain collapses, but I don't quite see why it would
 * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
 * synchronously and not have to be kick-started.
 */
static void
vm_pageout_page_free(vm_page_t m)
{
        vm_page_protect(m, VM_PROT_NONE);
        vm_page_free(m);
}

/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 */
struct vm_pageout_scan_info {
        struct proc *bigproc;
        vm_offset_t bigsize;
};

static int vm_pageout_scan_callback(struct proc *p, void *data);

static int
vm_pageout_scan_inactive(int pass, int q, int avail_shortage,
                         int *vnodes_skippedp)
{
        vm_page_t m;
        struct vm_page marker;
        struct vnode *vpfailed;         /* warning, allowed to be stale */
        int maxscan;
        int count;
        int delta = 0;
        vm_object_t object;
        int actcount;
        int maxlaunder;

        /*
         * Start scanning the inactive queue for pages we can move to the
         * cache or free.  The scan will stop when the target is reached or
         * we have scanned the entire inactive queue.  Note that m->act_count
         * is not used to form decisions for the inactive queue, only for the
         * active queue.
         *
         * maxlaunder limits the number of dirty pages we flush per scan.
         * For most systems a smaller value (16 or 32) is more robust under
         * extreme memory and disk pressure because any unnecessary writes
         * to disk can result in extreme performance degradation.  However,
         * systems with excessive dirty pages (especially when MAP_NOSYNC is
         * used) will die horribly with limited laundering.  If the pageout
         * daemon cannot clean enough pages in the first pass, we let it go
         * all out in succeeding passes.
         */
        if ((maxlaunder = vm_max_launder) <= 1)
                maxlaunder = 1;
        if (pass)
                maxlaunder = 10000;

        /*
         * Initialize our marker
         */
        bzero(&marker, sizeof(marker));
        marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
        marker.queue = PQ_INACTIVE + q;
        marker.pc = q;
        marker.wire_count = 1;

        /*
         * Inactive queue scan.
         *
         * NOTE: The vm_page must be spinlocked before the queue to avoid
         *       deadlocks, so it is easiest to simply iterate the loop
         *       with the queue unlocked at the top.
         */
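
        /*
         * Illustrative note (not from the original source): the scan below
         * uses the classic marker-page technique.  The fictitious, wired
         * marker is inserted into the queue, and each iteration looks at
         * TAILQ_NEXT(&marker, pageq).  After a real page is busied, the
         * marker is re-inserted just past it, so even if the page is
         * ripped out of the queue while we work on it (or while the queue
         * spinlock is dropped), the marker still records our position and
         * the scan can resume safely.
         */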
        vpfailed = NULL;

        vm_page_queues_spin_lock(PQ_INACTIVE + q);
        TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
        maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt;
        vm_page_queues_spin_unlock(PQ_INACTIVE + q);

        while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
               maxscan-- > 0 && avail_shortage - delta > 0)
        {
                vm_page_and_queue_spin_lock(m);
                if (m != TAILQ_NEXT(&marker, pageq)) {
                        vm_page_and_queue_spin_unlock(m);
                        ++maxscan;
                        continue;
                }
                KKASSERT(m->queue - m->pc == PQ_INACTIVE);
                TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
                             &marker, pageq);
                TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
                                   &marker, pageq);
                mycpu->gd_cnt.v_pdpages++;

                /*
                 * Skip marker pages
                 */
                if (m->flags & PG_MARKER) {
                        vm_page_and_queue_spin_unlock(m);
                        continue;
                }

                /*
                 * Try to busy the page.  Don't mess with pages which are
                 * already busy or reorder them in the queue.
                 */
                if (vm_page_busy_try(m, TRUE)) {
                        vm_page_and_queue_spin_unlock(m);
                        continue;
                }
                vm_page_and_queue_spin_unlock(m);
                KKASSERT(m->queue - m->pc == PQ_INACTIVE);

                lwkt_yield();

                /*
                 * The page has been successfully busied and is now no
                 * longer spinlocked.  The queue is no longer spinlocked
                 * either.
                 */

                /*
                 * It is possible for a page to be busied ad-hoc (e.g. the
                 * pmap_collect() code) and wired and race against the
                 * allocation of a new page.  vm_page_alloc() may be forced
                 * to deactivate the wired page, in which case it winds up
                 * on the inactive queue and must be handled here.  We
                 * correct the problem simply by unqueuing the page.
                 */
                if (m->wire_count) {
                        vm_page_unqueue_nowakeup(m);
                        vm_page_wakeup(m);
                        kprintf("WARNING: pagedaemon: wired page on "
                                "inactive queue %p\n", m);
                        continue;
                }

                /*
                 * A held page may be undergoing I/O, so skip it.
                 */
                if (m->hold_count) {
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_INACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_INACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_INACTIVE + q].pl,
                                        m, pageq);
                                ++vm_swapcache_inactive_heuristic;
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                        continue;
                }

                if (m->object == NULL || m->object->ref_count == 0) {
                        /*
                         * If the object is not being used, we ignore previous
                         * references.
                         */
                        vm_page_flag_clear(m, PG_REFERENCED);
                        pmap_clear_reference(m);
                        /* fall through to end */
                } else if (((m->flags & PG_REFERENCED) == 0) &&
                           (actcount = pmap_ts_referenced(m))) {
                        /*
                         * Otherwise, if the page has been referenced while
                         * in the inactive queue, we bump the "activation
                         * count" upwards, making it less likely that the
                         * page will be added back to the inactive queue
                         * prematurely again.  Here we check the page tables
                         * (or emulated bits, if any), given the upper level
                         * VM system not knowing anything about existing
                         * references.
                         */
                        vm_page_activate(m);
                        m->act_count += (actcount + ACT_ADVANCE);
                        vm_page_wakeup(m);
                        continue;
                }

                /*
                 * (m) is still busied.
                 *
                 * If the upper level VM system knows about any page
                 * references, we activate the page.  We also set the
                 * "activation count" higher than normal so that we will less
                 * likely place pages back onto the inactive queue again.
                 */
                if ((m->flags & PG_REFERENCED) != 0) {
                        vm_page_flag_clear(m, PG_REFERENCED);
                        actcount = pmap_ts_referenced(m);
                        vm_page_activate(m);
                        m->act_count += (actcount + ACT_ADVANCE + 1);
                        vm_page_wakeup(m);
                        continue;
                }

                /*
                 * If the upper level VM system doesn't know anything about
                 * the page being dirty, we have to check for it again.  As
                 * far as the VM code knows, any partially dirty pages are
                 * fully dirty.
                 *
                 * Pages marked PG_WRITEABLE may be mapped into the user
                 * address space of a process running on another cpu.  A
                 * user process (without holding the MP lock) running on
                 * another cpu may be able to touch the page while we are
                 * trying to remove it.  vm_page_cache() will handle this
                 * case for us.
                 */
                if (m->dirty == 0) {
                        vm_page_test_dirty(m);
                } else {
                        vm_page_dirty(m);
                }

                if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
                        /*
                         * Invalid pages can be easily freed
                         */
                        vm_pageout_page_free(m);
                        mycpu->gd_cnt.v_dfree++;
                        ++delta;
                } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
                        /*
                         * Clean pages can be placed onto the cache queue.
                         * This effectively frees them.
                         */
                        vm_page_cache(m);
                        ++delta;
                } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
                        /*
                         * Dirty pages need to be paged out, but flushing
                         * a page is extremely expensive versus freeing
                         * a clean page.  Rather than artificially limiting
                         * the number of pages we can flush, we instead give
                         * dirty pages extra priority on the inactive queue
                         * by forcing them to be cycled through the queue
                         * twice before being flushed, after which the
                         * (now clean) page will cycle through once more
                         * before being freed.  This significantly extends
                         * the thrash point for a heavily loaded machine.
                         */
                        vm_page_flag_set(m, PG_WINATCFLS);
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_INACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_INACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_INACTIVE + q].pl,
                                        m, pageq);
                                ++vm_swapcache_inactive_heuristic;
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                } else if (maxlaunder > 0) {
                        /*
                         * We always want to try to flush some dirty pages if
                         * we encounter them, to keep the system stable.
                         * Normally this number is small, but under extreme
                         * pressure where there are insufficient clean pages
                         * on the inactive queue, we may have to go all out.
                         */
                        int swap_pageouts_ok;
                        struct vnode *vp = NULL;

                        swap_pageouts_ok = 0;
                        object = m->object;
                        if (object &&
                            (object->type != OBJT_SWAP) &&
                            (object->type != OBJT_DEFAULT)) {
                                swap_pageouts_ok = 1;
                        } else {
                                swap_pageouts_ok = !(defer_swap_pageouts ||
                                                     disable_swap_pageouts);
                                swap_pageouts_ok |= (!disable_swap_pageouts &&
                                                     defer_swap_pageouts &&
                                                     vm_page_count_min(0));
                        }

                        /*
                         * We don't bother paging objects that are "dead".
                         * Those objects are in a "rundown" state.
                         */
                        if (!swap_pageouts_ok ||
                            (object == NULL) ||
                            (object->flags & OBJ_DEAD)) {
                                vm_page_and_queue_spin_lock(m);
                                if (m->queue - m->pc == PQ_INACTIVE) {
                                        TAILQ_REMOVE(
                                            &vm_page_queues[PQ_INACTIVE + q].pl,
                                            m, pageq);
                                        TAILQ_INSERT_TAIL(
                                            &vm_page_queues[PQ_INACTIVE + q].pl,
                                            m, pageq);
                                        ++vm_swapcache_inactive_heuristic;
                                }
                                vm_page_and_queue_spin_unlock(m);
                                vm_page_wakeup(m);
                                continue;
                        }

                        /*
                         * (m) is still busied.
                         *
                         * The object is already known NOT to be dead.  It
                         * is possible for the vget() to block the whole
                         * pageout daemon, but the new low-memory handling
                         * code should prevent it.
                         *
                         * The previous code skipped locked vnodes and, worse,
                         * reordered pages in the queue.  This results in
                         * completely non-deterministic operation because,
                         * quite often, a vm_fault has initiated an I/O and
                         * is holding a locked vnode at just the point where
                         * the pageout daemon is woken up.
                         *
                         * We can't wait forever for the vnode lock; we might
                         * deadlock due to a vn_read() getting stuck in
                         * vm_wait while holding this vnode.  We skip the
                         * vnode if we can't get it in a reasonable amount
                         * of time.
                         *
                         * vpfailed is used to (try to) avoid the case where
                         * a large number of pages are associated with a
                         * locked vnode, which could cause the pageout daemon
                         * to stall for an excessive amount of time.
                         */
                        if (object->type == OBJT_VNODE) {
                                int flags;

                                vp = object->handle;
                                flags = LK_EXCLUSIVE;
                                if (vp == vpfailed)
                                        flags |= LK_NOWAIT;
                                else
                                        flags |= LK_TIMELOCK;
                                vm_page_hold(m);
                                vm_page_wakeup(m);

                                /*
                                 * We have unbusied (m) temporarily so we can
                                 * acquire the vp lock without deadlocking.
                                 * (m) is held to prevent destruction.
                                 */
                                if (vget(vp, flags) != 0) {
                                        vpfailed = vp;
                                        ++pageout_lock_miss;
                                        if (object->flags & OBJ_MIGHTBEDIRTY)
                                                ++*vnodes_skippedp;
                                        vm_page_unhold(m);
                                        continue;
                                }

                                /*
                                 * The page might have been moved to another
                                 * queue during potential blocking in vget()
                                 * above.  The page might have been freed and
                                 * reused for another vnode.  The object might
                                 * have been reused for another vnode.
                                 */
                                if (m->queue - m->pc != PQ_INACTIVE ||
                                    m->object != object ||
                                    object->handle != vp) {
                                        if (object->flags & OBJ_MIGHTBEDIRTY)
                                                ++*vnodes_skippedp;
                                        vput(vp);
                                        vm_page_unhold(m);
                                        continue;
                                }

                                /*
                                 * The page may have been busied during the
                                 * blocking in vget().  We don't move the
                                 * page back onto the end of the queue so that
                                 * statistics are more correct if we don't.
                                 */
                                if (vm_page_busy_try(m, TRUE)) {
                                        vput(vp);
                                        vm_page_unhold(m);
                                        continue;
                                }
                                vm_page_unhold(m);

                                /*
                                 * (m) is busied again
                                 *
                                 * We own the busy bit and remove our hold
                                 * bit.  If the page is still held it
                                 * might be undergoing I/O, so skip it.
                                 */
                                if (m->hold_count) {
                                        vm_page_and_queue_spin_lock(m);
                                        if (m->queue - m->pc == PQ_INACTIVE) {
                                                TAILQ_REMOVE(
                                                    &vm_page_queues[PQ_INACTIVE + q].pl,
                                                    m, pageq);
                                                TAILQ_INSERT_TAIL(
                                                    &vm_page_queues[PQ_INACTIVE + q].pl,
                                                    m, pageq);
                                                ++vm_swapcache_inactive_heuristic;
                                        }
                                        vm_page_and_queue_spin_unlock(m);
                                        if (object->flags & OBJ_MIGHTBEDIRTY)
                                                ++*vnodes_skippedp;
                                        vm_page_wakeup(m);
                                        vput(vp);
                                        continue;
                                }
                                /* (m) is left busied as we fall through */
                        }

                        /*
                         * page is busy and not held here.
                         *
                         * If a page is dirty, then it is either being washed
                         * (but not yet cleaned) or it is still in the
                         * laundry.  If it is still in the laundry, then we
                         * start the cleaning operation.
                         *
                         * decrement inactive_shortage on success to account
                         * for the (future) cleaned page.  Otherwise we
                         * could wind up laundering or cleaning too many
                         * pages.
                         */
                        count = vm_pageout_clean(m);
                        delta += count;
                        maxlaunder -= count;

                        /*
                         * vm_pageout_clean() consumed the busy; the page is
                         * no longer accessible.
                         */
                        if (vp != NULL)
                                vput(vp);
                } else {
                        vm_page_wakeup(m);
                }

                /*
                 * Systems with a ton of memory can wind up with huge
                 * deactivation counts.  Because the inactive scan is
                 * doing a lot of flushing, the combination can result
                 * in excessive paging even in situations where other
                 * unrelated threads free up sufficient VM.
                 *
                 * To deal with this we abort the nominal active->inactive
                 * scan before we hit the inactive target when free+cache
                 * levels have already reached their target.
                 *
                 * Note that nominally the inactive scan is not freeing or
                 * caching pages, it is deactivating active pages, so it
                 * will not by itself cause the abort condition.
                 */
                if (vm_paging_target() < 0)
                        break;
        }
        vm_page_queues_spin_lock(PQ_INACTIVE + q);
        TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_INACTIVE + q);

        return (delta);
}

static int
vm_pageout_scan_active(int pass, int q,
                       int avail_shortage, int inactive_shortage,
                       int *recycle_countp)
{
        struct vm_page marker;
        vm_page_t m;
        int actcount;
        int delta = 0;
        int maxscan;

        /*
         * We want to move pages from the active queue to the inactive
         * queue to get the inactive queue to the inactive target.  If
         * we still have a page shortage from above we try to directly free
         * clean pages instead of moving them.
         *
         * If we do still have a shortage we keep track of the number of
         * pages we free or cache (recycle_count) as a measure of thrashing
         * between the active and inactive queues.
         *
         * If we were able to completely satisfy the free+cache targets
         * from the inactive pool we limit the number of pages we move
         * from the active pool to the inactive pool to 2x the pages we
         * had removed from the inactive pool (with a minimum of 1/5 the
         * inactive target).  If we were not able to completely satisfy
         * the free+cache targets we go for the whole target aggressively.
         *
         * NOTE: Both variables can end up negative.
         * NOTE: We are still in a critical section.
         */

        bzero(&marker, sizeof(marker));
        marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
        marker.queue = PQ_ACTIVE + q;
        marker.pc = q;
        marker.wire_count = 1;

        vm_page_queues_spin_lock(PQ_ACTIVE + q);
        TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
        maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt;
        vm_page_queues_spin_unlock(PQ_ACTIVE + q);

        while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
               maxscan-- > 0 && (avail_shortage - delta > 0 ||
                                 inactive_shortage > 0))
        {
                vm_page_and_queue_spin_lock(m);
                if (m != TAILQ_NEXT(&marker, pageq)) {
                        vm_page_and_queue_spin_unlock(m);
                        ++maxscan;
                        continue;
                }
                KKASSERT(m->queue - m->pc == PQ_ACTIVE);
                TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
                             &marker, pageq);
                TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
                                   &marker, pageq);

                /*
                 * Skip marker pages
                 */
                if (m->flags & PG_MARKER) {
                        vm_page_and_queue_spin_unlock(m);
                        continue;
                }

                /*
                 * Try to busy the page.  Don't mess with pages which are
                 * already busy or reorder them in the queue.
                 */
                if (vm_page_busy_try(m, TRUE)) {
                        vm_page_and_queue_spin_unlock(m);
                        continue;
                }

                /*
                 * Don't deactivate pages that are held, even if we can
                 * busy them.  (XXX why not?)
                 */
                if (m->hold_count != 0) {
                        TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
                                     m, pageq);
                        TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE + q].pl,
                                          m, pageq);
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                        continue;
                }
                vm_page_and_queue_spin_unlock(m);
                lwkt_yield();

                /*
                 * The page has been successfully busied and the page and
                 * queue are no longer locked.
                 */

                /*
                 * The count for pagedaemon pages is done after checking the
                 * page for eligibility...
                 */
                mycpu->gd_cnt.v_pdpages++;

                /*
                 * Check to see "how much" the page has been used and clear
                 * the tracking access bits.  If the object has no references
                 * don't bother paying the expense.
                 */
                actcount = 0;
                if (m->object && m->object->ref_count != 0) {
                        if (m->flags & PG_REFERENCED)
                                ++actcount;
                        actcount += pmap_ts_referenced(m);
                        if (actcount) {
                                m->act_count += ACT_ADVANCE + actcount;
                                if (m->act_count > ACT_MAX)
                                        m->act_count = ACT_MAX;
                        }
                }
                vm_page_flag_clear(m, PG_REFERENCED);

                /*
                 * actcount is only valid if the object ref_count is non-zero.
                 * If the page does not have an object, actcount will be zero.
                 */
                if (actcount && m->object->ref_count != 0) {
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                } else {
                        switch(m->object->type) {
                        case OBJT_DEFAULT:
                        case OBJT_SWAP:
                                m->act_count -= min(m->act_count,
                                                    vm_anonmem_decline);
                                break;
                        default:
                                m->act_count -= min(m->act_count,
                                                    vm_filemem_decline);
                                break;
                        }
                        if (vm_pageout_algorithm ||
                            (m->object == NULL) ||
                            (m->object && (m->object->ref_count == 0)) ||
                            m->act_count < pass + 1
                        ) {
                                /*
                                 * Deactivate the page.  If we had a
                                 * shortage from our inactive scan try to
                                 * free (cache) the page instead.
                                 *
                                 * Don't just blindly cache the page if
                                 * we do not have a shortage from the
                                 * inactive scan, that could lead to
                                 * gigabytes being moved.
                                 */
                                --inactive_shortage;
                                if (avail_shortage - delta > 0 ||
                                    (m->object && (m->object->ref_count == 0)))
                                {
                                        if (avail_shortage - delta > 0)
                                                ++*recycle_countp;
                                        vm_page_protect(m, VM_PROT_NONE);
                                        if (m->dirty == 0 &&
                                            (m->flags & PG_NEED_COMMIT) == 0 &&
                                            avail_shortage - delta > 0) {
                                                vm_page_cache(m);
                                        } else {
                                                vm_page_deactivate(m);
                                                vm_page_wakeup(m);
                                        }
                                } else {
                                        vm_page_deactivate(m);
                                        vm_page_wakeup(m);
                                }
                                ++delta;
                        } else {
                                vm_page_and_queue_spin_lock(m);
                                if (m->queue - m->pc == PQ_ACTIVE) {
                                        TAILQ_REMOVE(
                                            &vm_page_queues[PQ_ACTIVE + q].pl,
                                            m, pageq);
                                        TAILQ_INSERT_TAIL(
                                            &vm_page_queues[PQ_ACTIVE + q].pl,
                                            m, pageq);
                                }
                                vm_page_and_queue_spin_unlock(m);
                                vm_page_wakeup(m);
                        }
                }
        }

        /*
         * Clean out our local marker.
         */
        vm_page_queues_spin_lock(PQ_ACTIVE + q);
        TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_ACTIVE + q);

        return (delta);
}

/*
 * The number of actually free pages can drop down to v_free_reserved;
 * we try to build the free count back above v_free_min.  Note that
 * vm_paging_needed() also returns TRUE if v_free_count is not at
 * least v_free_min, so that is the minimum we must build the free
 * count to.
 *
 * We use a slightly higher target to improve hysteresis,
 * ((v_free_target + v_free_min) / 2).  Since v_free_target
 * is usually the same as v_cache_min this maintains about
 * half the pages in the free queue as are in the cache queue,
 * providing pretty good pipelining for pageout operation.
 *
 * The system operator can manipulate vm.v_cache_min and
 * vm.v_free_target to tune the pageout daemon.  Be sure
 * to keep vm.v_free_min < vm.v_free_target.
 *
 * Note that the original paging target is to get at least
 * (free_min + cache_min) into (free + cache).  The slightly
 * higher target will shift additional pages from cache to free
 * without affecting the original paging target, in order to
 * maintain better hysteresis and not have the free count always
 * be dead-on v_free_min.
 *
 * NOTE: we are still in a critical section.
 *
 * Pages moved from PQ_CACHE to totally free are not counted in the
 * pages_freed counter.
 */
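/*
 * Worked example (illustrative numbers, not from the original source):
 * on a machine where v_free_min is 5,000 pages and v_free_target is
 * 25,000 pages, the loop below keeps pulling pages off PQ_CACHE and
 * freeing them until v_free_count reaches (5,000 + 25,000) / 2 =
 * 15,000 pages.  Aiming between the two thresholds is what provides
 * the hysteresis described above: the free count is rebuilt well past
 * the wakeup point without draining the cache queue all the way to
 * the full target.
 */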
static void
vm_pageout_scan_cache(int avail_shortage, int vnodes_skipped, int recycle_count)
{
        struct vm_pageout_scan_info info;
        vm_page_t m;

        while (vmstats.v_free_count <
               (vmstats.v_free_min + vmstats.v_free_target) / 2) {
                /*
                 * This steals some code from vm/vm_page.c
                 */
                static int cache_rover = 0;

                m = vm_page_list_find(PQ_CACHE, cache_rover & PQ_L2_MASK,
                                      FALSE);
                if (m == NULL)
                        break;
                /* page is returned removed from its queue and spinlocked */
                if (vm_page_busy_try(m, TRUE)) {
                        vm_page_deactivate_locked(m);
                        vm_page_spin_unlock(m);
                        continue;
                }
                vm_page_spin_unlock(m);
                pagedaemon_wakeup();
                lwkt_yield();

                /*
                 * Page has been successfully busied; it and its queue
                 * are no longer spinlocked.
                 */
                if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
                    m->hold_count ||
                    m->wire_count) {
                        vm_page_deactivate(m);
                        vm_page_wakeup(m);
                        continue;
                }
                KKASSERT((m->flags & PG_MAPPED) == 0);
                KKASSERT(m->dirty == 0);
                cache_rover += PQ_PRIME2;
                vm_pageout_page_free(m);
                mycpu->gd_cnt.v_dfree++;
        }

#if !defined(NO_SWAPPING)
        /*
         * Idle process swapout -- run once per second.
         */
        if (vm_swap_idle_enabled) {
                static time_t lsec;
                if (time_uptime != lsec) {
                        vm_pageout_req_swapout |= VM_SWAP_IDLE;
                        vm_req_vmdaemon();
                        lsec = time_uptime;
                }
        }
#endif

        /*
         * If we didn't get enough free pages, and we have skipped a vnode
         * in a writeable object, wakeup the sync daemon.  And kick swapout
         * if we did not get enough free pages.
         */
        if (vm_paging_target() > 0) {
                if (vnodes_skipped && vm_page_count_min(0))
                        speedup_syncer(NULL);
#if !defined(NO_SWAPPING)
                if (vm_swap_enabled && vm_page_count_target()) {
                        vm_req_vmdaemon();
                        vm_pageout_req_swapout |= VM_SWAP_NORMAL;
                }
#endif
        }

        /*
         * Handle catastrophic conditions.  Under good conditions we should
         * be at the target, well beyond our minimum.  If we could not even
         * reach our minimum the system is under heavy stress.
         *
         * Determine whether we have run out of memory.  This occurs when
         * swap_pager_full is TRUE and the only pages left in the page
         * queues are dirty.  We will still likely have page shortages.
         *
         * - swap_pager_full is set if insufficient swap was
         *   available to satisfy a requested pageout.
         *
         * - the inactive queue is bloated (4 x size of active queue),
         *   meaning it is unable to get rid of dirty pages.
         *
         * - vm_page_count_min() without counting pages recycled from the
         *   active queue (recycle_count) means we could not recover
         *   enough pages to meet bare minimum needs.  This test only
         *   works if the inactive queue is bloated.
         *
         * - due to a positive avail_shortage we shifted the remaining
         *   dirty pages from the active queue to the inactive queue
         *   trying to find clean ones to free.
         */
        if (swap_pager_full && vm_page_count_min(recycle_count))
                kprintf("Warning: system low on memory+swap!\n");
        if (swap_pager_full && vm_page_count_min(recycle_count) &&
            vmstats.v_inactive_count > vmstats.v_active_count * 4 &&
            avail_shortage > 0) {
                /*
                 * Kill something.
                 */
                info.bigproc = NULL;
                info.bigsize = 0;
                allproc_scan(vm_pageout_scan_callback, &info);
                if (info.bigproc != NULL) {
                        killproc(info.bigproc, "out of swap space");
                        info.bigproc->p_nice = PRIO_MIN;
                        info.bigproc->p_usched->resetpriority(
                                FIRST_LWP_IN_PROC(info.bigproc));
                        wakeup(&vmstats.v_free_count);
                        PRELE(info.bigproc);
                }
        }
}

static int
vm_pageout_scan_callback(struct proc *p, void *data)
{
        struct vm_pageout_scan_info *info = data;
        vm_offset_t size;

        /*
         * Never kill system processes or init.  If we have configured swap
         * then try to avoid killing low-numbered pids.
         */
        if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
            ((p->p_pid < 48) && (vm_swap_size != 0))) {
                return (0);
        }

        lwkt_gettoken(&p->p_token);

        /*
         * If the process is in a non-running type state,
         * don't touch it.
         */
        if (p->p_stat != SACTIVE && p->p_stat != SSTOP) {
                lwkt_reltoken(&p->p_token);
                return (0);
        }

        /*
         * Get the approximate process size.  Note that anonymous pages
         * with backing swap will be counted twice, but there should not
         * be too many such pages due to the stress the VM system is
         * under at this point.
         */
        size = vmspace_anonymous_count(p->p_vmspace) +
               vmspace_swap_count(p->p_vmspace);

        /*
         * If this process is bigger than the biggest one,
         * remember it.
         */
        if (info->bigsize < size) {
                if (info->bigproc)
                        PRELE(info->bigproc);
                PHOLD(p);
                info->bigproc = p;
                info->bigsize = size;
        }
        lwkt_reltoken(&p->p_token);
        lwkt_yield();

        return(0);
}

/*
 * This routine tries to maintain the pseudo LRU active queue so that
 * some statistics accumulate even during long periods in which there
 * is no paging.  This helps the situation where paging just starts
 * to occur.
 */
static void
vm_pageout_page_stats(int q)
{
        static int fullintervalcount = 0;
        struct vm_page marker;
        vm_page_t m;
        int pcount, tpcount;            /* Number of pages to check */
        int page_shortage;

        page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
                         vmstats.v_free_min) -
                        (vmstats.v_free_count + vmstats.v_inactive_count +
                         vmstats.v_cache_count);

        if (page_shortage <= 0)
                return;

        pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
        fullintervalcount += vm_pageout_stats_interval;
        if (fullintervalcount < vm_pageout_full_stats_interval) {
                tpcount = (vm_pageout_stats_max * pcount) /
                          vmstats.v_page_count + 1;
                if (pcount > tpcount)
                        pcount = tpcount;
        } else {
                fullintervalcount = 0;
        }
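
        /*
         * Example (illustrative numbers, not from the original source):
         * with v_page_count = 1,000,000, vm_pageout_stats_max = 20,000,
         * and an active queue of pcount = 50,000 pages, a partial scan
         * is capped at tpcount = (20,000 * 50,000) / 1,000,000 + 1 =
         * 1,001 pages.  Only when fullintervalcount accumulates past
         * vm_pageout_full_stats_interval does the scan cover the whole
         * queue, so the cheap partial scans dominate.
         */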

        bzero(&marker, sizeof(marker));
        marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
        marker.queue = PQ_ACTIVE + q;
        marker.pc = q;
        marker.wire_count = 1;

        vm_page_queues_spin_lock(PQ_ACTIVE + q);
        TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_ACTIVE + q);

        while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
               pcount-- > 0)
        {
                int actcount;

                vm_page_and_queue_spin_lock(m);
                if (m != TAILQ_NEXT(&marker, pageq)) {
                        vm_page_and_queue_spin_unlock(m);
                        ++pcount;
                        continue;
                }
                KKASSERT(m->queue - m->pc == PQ_ACTIVE);
                TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
                             &marker, pageq);
                TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
                                   &marker, pageq);

                /*
                 * Ignore markers
                 */
                if (m->flags & PG_MARKER) {
                        vm_page_and_queue_spin_unlock(m);
                        continue;
                }

                /*
                 * Ignore pages we can't busy
                 */
                if (vm_page_busy_try(m, TRUE)) {
                        vm_page_and_queue_spin_unlock(m);
                        continue;
                }
                vm_page_and_queue_spin_unlock(m);
                KKASSERT(m->queue - m->pc == PQ_ACTIVE);

                /*
                 * We now have a safely busied page, and the page and queue
                 * spinlocks have been released.
                 *
                 * Ignore held pages
                 */
                if (m->hold_count) {
                        vm_page_wakeup(m);
                        continue;
                }

                /*
                 * Calculate activity
                 */
                actcount = 0;
                if (m->flags & PG_REFERENCED) {
                        vm_page_flag_clear(m, PG_REFERENCED);
                        actcount += 1;
                }
                actcount += pmap_ts_referenced(m);

                /*
                 * Update act_count and move page to end of queue.
                 */
                if (actcount) {
                        m->act_count += ACT_ADVANCE + actcount;
                        if (m->act_count > ACT_MAX)
                                m->act_count = ACT_MAX;
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                        continue;
                }

                if (m->act_count == 0) {
                        /*
                         * We turn off page access, so that we have
                         * more accurate RSS stats.  We don't do this
                         * in the normal page deactivation when the
                         * system is loaded VM wise, because the
                         * cost of the large number of page protect
                         * operations would be higher than the value
                         * of doing the operation.
                         *
                         * We use the marker to save our place so
                         * we can release the spin lock.  Both (m)
                         * and (next) will be invalid.
                         */
                        vm_page_protect(m, VM_PROT_NONE);
                        vm_page_deactivate(m);
                } else {
                        m->act_count -= min(m->act_count, ACT_DECLINE);
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                }
                vm_page_wakeup(m);
        }

        /*
         * Remove our local marker
         */
        vm_page_queues_spin_lock(PQ_ACTIVE + q);
        TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_ACTIVE + q);
}

static int
vm_pageout_free_page_calc(vm_size_t count)
{
        if (count < vmstats.v_page_count)
                return 0;
        /*
         * free_reserved needs to include enough for the largest swap pager
         * structures plus enough for any pv_entry structs when paging.
         *
         * v_free_min           normal allocations
         * v_free_reserved      system allocations
         * v_pageout_free_min   allocations by pageout daemon
         * v_interrupt_free_min low level allocations (e.g swap structures)
         */
        if (vmstats.v_page_count > 1024)
                vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200;
        else
                vmstats.v_free_min = 64;
        vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
        vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
        vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
        vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
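
        /*
         * Example (illustrative numbers, not from the original source):
         * on a machine with v_page_count = 1,048,576 pages (4GB of 4K
         * pages), v_free_min = 64 + (1,048,576 - 1024) / 200 = 5,301
         * pages (~20.7MB).  The derived thresholds are then
         * v_free_reserved = 5,301 * 4 / 8 + 7 = 2,657,
         * v_free_severe = 2,650, v_pageout_free_min = 1,332, and
         * v_interrupt_free_min = 669.
         */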

        return 1;
}

/*
 * vm_pageout is the high level pageout daemon.
 *
 * No requirements.
 */
static void
vm_pageout_thread(void)
{
        int pass;
        int q;
        int q1iterator = 0;
        int q2iterator = 0;

        /*
         * Initialize some paging parameters.
         */
        curthread->td_flags |= TDF_SYSTHREAD;

        vm_pageout_free_page_calc(vmstats.v_page_count);

        /*
         * v_free_target and v_cache_min control pageout hysteresis.  Note
         * that these are more a measure of the VM cache queue hysteresis
         * than the VM free queue.  Specifically, v_free_target is the
         * high water mark (free+cache pages).
         *
         * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
         * low water mark, while v_free_min is the stop.  v_cache_min must
         * be big enough to handle memory needs while the pageout daemon
         * is signalled and run to free more pages.
         */
        if (vmstats.v_free_count > 6144)
                vmstats.v_free_target = 4 * vmstats.v_free_min +
                                        vmstats.v_free_reserved;
        else
                vmstats.v_free_target = 2 * vmstats.v_free_min +
                                        vmstats.v_free_reserved;
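
        /*
         * Example (continuing the illustrative 4GB numbers above, not
         * from the original source): with v_free_min = 5,301 and
         * v_free_reserved = 2,657, a machine with more than 6144 free
         * pages at boot gets v_free_target = 4 * 5,301 + 2,657 = 23,861
         * pages (~93MB of free+cache headroom).
         */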
1794
1795 /*
1796 * NOTE: With the new buffer cache b_act_count we want the default
1797 * inactive target to be a percentage of available memory.
1798 *
1799 * The inactive target essentially determines the minimum
1800 * number of 'temporary' pages capable of caching one-time-use
1801 * files when the VM system is otherwise full of pages
1802 * belonging to multi-time-use files or active program data.
1803 *
1804 * NOTE: The inactive target is aggressively persued only if the
1805 * inactive queue becomes too small. If the inactive queue
1806 * is large enough to satisfy page movement to free+cache
1807 * then it is repopulated more slowly from the active queue.
1808 * This allows a general inactive_target default to be set.
1809 *
1810 * There is an issue here for processes which sit mostly idle
1811 * 'overnight', such as sshd, tcsh, and X. Any movement from
1812 * the active queue will eventually cause such pages to
1813 * recycle eventually causing a lot of paging in the morning.
1814 * To reduce the incidence of this pages cycled out of the
1815 * buffer cache are moved directly to the inactive queue if
1816 * they were only used once or twice.
1817 *
1818 * The vfs.vm_cycle_point sysctl can be used to adjust this.
1819 * Increasing the value (up to 64) increases the number of
1820 * buffer recyclements which go directly to the inactive queue.
1821 */
1822 if (vmstats.v_free_count > 2048) {
1823 vmstats.v_cache_min = vmstats.v_free_target;
1824 vmstats.v_cache_max = 2 * vmstats.v_cache_min;
1825 } else {
1826 vmstats.v_cache_min = 0;
1827 vmstats.v_cache_max = 0;
1828 }
1829 vmstats.v_inactive_target = vmstats.v_free_count / 4;
1830
1831 /* XXX does not really belong here */
1832 if (vm_page_max_wired == 0)
1833 vm_page_max_wired = vmstats.v_free_count / 3;
1834
1835 if (vm_pageout_stats_max == 0)
1836 vm_pageout_stats_max = vmstats.v_free_target;
1837
1838 /*
1839 * Set interval in seconds for stats scan.
1840 */
1841 if (vm_pageout_stats_interval == 0)
1842 vm_pageout_stats_interval = 5;
1843 if (vm_pageout_full_stats_interval == 0)
1844 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
1845
1846
1847 /*
1848 * Set maximum free per pass
1849 */
1850 if (vm_pageout_stats_free_max == 0)
1851 vm_pageout_stats_free_max = 5;
1852
1853 swap_pager_swap_init();
1854 pass = 0;
1855
1856 /*
1857 * The pageout daemon is never done, so loop forever.
1858 */
1859 while (TRUE) {
1860 int error;
1861 int avail_shortage;
1862 int inactive_shortage;
1863 int vnodes_skipped = 0;
1864 int recycle_count = 0;
1865 int tmp;
1866
1867 /*
1868		 * Wait for an action request.  If we time out, check to
1869 * see if paging is needed (in case the normal wakeup
1870 * code raced us).
1871 */
1872 if (vm_pages_needed == 0) {
1873 error = tsleep(&vm_pages_needed,
1874 0, "psleep",
1875 vm_pageout_stats_interval * hz);
1876 if (error &&
1877 vm_paging_needed() == 0 &&
1878 vm_pages_needed == 0) {
1879 for (q = 0; q < PQ_L2_SIZE; ++q)
1880 vm_pageout_page_stats(q);
1881 continue;
1882 }
1883 vm_pages_needed = 1;
1884 }
1885
1886 mycpu->gd_cnt.v_pdwakeups++;
1887
1888 /*
1889		 * Do whatever cleanup the pmap code can.
1890 */
1891 pmap_collect();
1892
1893 /*
1894 * Scan for pageout. Try to avoid thrashing the system
1895 * with activity.
1896 *
1897 * Calculate our target for the number of free+cache pages we
1898		 * want to get to.  This is higher than the number that causes
1899 * allocations to stall (severe) in order to provide hysteresis,
1900 * and if we don't make it all the way but get to the minimum
1901		 * we're happy.  Goose it a bit if there are multiple
1902 * requests for memory.
1903 */
1904 avail_shortage = vm_paging_target() + vm_pageout_deficit;
1905 vm_pageout_deficit = 0;
1906
1907 if (avail_shortage > 0) {
1908 for (q = 0; q < PQ_L2_SIZE; ++q) {
1909 avail_shortage -=
1910 vm_pageout_scan_inactive(
1911 pass,
1912 (q + q1iterator) & PQ_L2_MASK,
1913 PQAVERAGE(avail_shortage),
1914 &vnodes_skipped);
1915 if (avail_shortage <= 0)
1916 break;
1917 }
1918 q1iterator = q + 1;
1919 }
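		/*
		 * The scan above is a rotating round-robin over the
		 * PQ_L2_SIZE page queues, with PQAVERAGE() splitting
		 * the shortage per queue.  Equivalent shape, using the
		 * names from the code:
		 *
		 *	for (q = 0; q < PQ_L2_SIZE; ++q)
		 *		scan((q + q1iterator) & PQ_L2_MASK);
		 *
		 * Saving q1iterator = q + 1 makes the next pass begin
		 * at the queue after the one that satisfied this pass,
		 * so no single queue is scanned preferentially.
		 */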
1920
1921 /*
1922 * Figure out how many active pages we must deactivate. If
1923 * we were able to reach our target with just the inactive
1924		 * scan above, we limit the number of active pages we
1925 * deactivate to reduce unnecessary work.
1926 */
1927 inactive_shortage = vmstats.v_inactive_target -
1928 vmstats.v_inactive_count;
1929
1930 /*
1931 * If we were unable to free sufficient inactive pages to
1932 * satisfy the free/cache queue requirements then simply
1933 * reaching the inactive target may not be good enough.
1934 * Try to deactivate pages in excess of the target based
1935 * on the shortfall.
1936 *
1937		 * However, to prevent thrashing the VM system, do not
1938		 * deactivate more than an additional 1/10 of the inactive
1939		 * target's worth of active pages.
1940 */
1941 if (avail_shortage > 0) {
1942 tmp = avail_shortage * 2;
1943 if (tmp > vmstats.v_inactive_target / 10)
1944 tmp = vmstats.v_inactive_target / 10;
1945 inactive_shortage += tmp;
1946 }
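		/*
		 * A worked example of the cap above: with
		 * avail_shortage = 1000 and v_inactive_target = 30000,
		 * tmp = 2000, which is under the 3000-page cap, so
		 * inactive_shortage grows by 2000.  With
		 * avail_shortage = 5000, tmp = 10000 is clamped to
		 * 3000 (1/10 of the inactive target).
		 */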
1947
1948 /*
1949 * Only trigger on inactive shortage. Triggering on
1950 * avail_shortage can starve the active queue with
1951 * unnecessary active->inactive transitions and destroy
1952 * performance.
1953 */
1954 if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
1955 int delta;
1956
1957 for (q = 0; q < PQ_L2_SIZE; ++q) {
1958 delta = vm_pageout_scan_active(
1959 pass,
1960 (q + q2iterator) & PQ_L2_MASK,
1961 PQAVERAGE(avail_shortage),
1962 PQAVERAGE(inactive_shortage),
1963 &recycle_count);
1964 inactive_shortage -= delta;
1965 avail_shortage -= delta;
1966 if (inactive_shortage <= 0 &&
1967 avail_shortage <= 0) {
1968 break;
1969 }
1970 }
1971 q2iterator = q + 1;
1972 }
1973
1974 /*
1975		 * Finally, free enough cache pages to meet our free page
1976 * requirement and take more drastic measures if we are
1977 * still in trouble.
1978 */
1979 vm_pageout_scan_cache(avail_shortage, vnodes_skipped,
1980 recycle_count);
1981
1982 /*
1983 * Wait for more work.
1984 */
1985 if (avail_shortage > 0) {
1986 ++pass;
1987 if (swap_pager_full) {
1988 /*
1989 * Running out of memory, catastrophic back-off
1990 * to one-second intervals.
1991 */
1992 tsleep(&vm_pages_needed, 0, "pdelay", hz);
1993 } else if (pass < 10 && vm_pages_needed > 1) {
1994 /*
1995 * Normal operation, additional processes
1996 * have already kicked us. Retry immediately.
1997 */
1998 } else if (pass < 10) {
1999 /*
2000 * Normal operation, fewer processes. Delay
2001 * a bit but allow wakeups.
2002 */
2003 vm_pages_needed = 0;
2004 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
2005 vm_pages_needed = 1;
2006 } else {
2007 /*
2008 * We've taken too many passes, forced delay.
2009 */
2010 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
2011 }
2012 } else if (vm_pages_needed) {
2013 /*
2014 * Interlocked wakeup of waiters (non-optional).
2015 *
2016			 * Similar to vm_page_free_wakeup() in vm_page.c, wake
2017			 * any waiters sleeping on v_free_count once the counts
2018			 * have recovered past the minimum/target thresholds.
2018 */
2019 pass = 0;
2020 if (!vm_page_count_min(vm_page_free_hysteresis) ||
2021 !vm_page_count_target()) {
2022 vm_pages_needed = 0;
2023 wakeup(&vmstats.v_free_count);
2024 }
2025 } else {
2026 pass = 0;
2027 }
2028 }
2029 }
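/*
 * A summary of the shortage back-off ladder at the bottom of the pageout
 * loop above (the sleep taken when avail_shortage persists):
 *
 *	swap_pager_full			tsleep(hz)	catastrophic back-off
 *	pass < 10, multiple waiters	(none)		retry immediately
 *	pass < 10, one waiter		tsleep(hz/10)	wakeups allowed
 *	pass >= 10			tsleep(hz/10)	forced delay
 */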
2030
2031 static struct kproc_desc page_kp = {
2032 "pagedaemon",
2033 vm_pageout_thread,
2034 &pagethread
2035 };
2036 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp);
2037
2038
2039 /*
2040 * Called after allocating a page out of the cache or free queue
2041  * to possibly wake the pagedaemon up to replenish our supply.
2042 *
2043 * We try to generate some hysteresis by waking the pagedaemon up
2044 * when our free+cache pages go below the free_min+cache_min level.
2045 * The pagedaemon tries to get the count back up to at least the
2046 * minimum, and through to the target level if possible.
2047 *
2048  * If the pagedaemon is already active, bump vm_pages_needed as a hint
2049 * that there are even more requests pending.
2050 *
2051 * SMP races ok?
2052 * No requirements.
2053 */
2054 void
2055 pagedaemon_wakeup(void)
2056 {
2057 if (vm_paging_needed() && curthread != pagethread) {
2058 if (vm_pages_needed == 0) {
2059 vm_pages_needed = 1; /* SMP race ok */
2060 wakeup(&vm_pages_needed);
2061 } else if (vm_page_count_min(0)) {
2062 ++vm_pages_needed; /* SMP race ok */
2063 }
2064 }
2065 }
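/*
 * pagedaemon_wakeup() is the cheap side of the hysteresis described
 * above -- presumably invoked from page-allocation paths (e.g. in
 * vm_page.c) when the free+cache counts dip.  The first waker sets
 * vm_pages_needed and issues the wakeup; later callers merely bump the
 * counter as a hint, avoiding redundant wakeups.
 */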
2066
2067 #if !defined(NO_SWAPPING)
2068
2069 /*
2070 * SMP races ok?
2071 * No requirements.
2072 */
2073 static void
2074 vm_req_vmdaemon(void)
2075 {
2076 static int lastrun = 0;
2077
2078 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
2079 wakeup(&vm_daemon_needed);
2080 lastrun = ticks;
2081 }
2082 }
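/*
 * The test above rate-limits vm_daemon wakeups to at most one per
 * second ('ticks > lastrun + hz'); the second clause ('ticks < lastrun')
 * re-arms the limiter if the tick counter wraps around.
 */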
2083
2084 static int vm_daemon_callback(struct proc *p, void *data __unused);
2085
2086 /*
2087 * No requirements.
2088 */
2089 static void
2090 vm_daemon(void)
2091 {
2092 /*
2093 * XXX vm_daemon_needed specific token?
2094 */
2095 while (TRUE) {
2096 tsleep(&vm_daemon_needed, 0, "psleep", 0);
2097 if (vm_pageout_req_swapout) {
2098 swapout_procs(vm_pageout_req_swapout);
2099 vm_pageout_req_swapout = 0;
2100 }
2101 /*
2102	 * Scan processes that exceed their rlimits or that are
2103	 * swapped out -- deactivate pages.
2104 */
2105 allproc_scan(vm_daemon_callback, NULL);
2106 }
2107 }
2108
2109 static int
2110 vm_daemon_callback(struct proc *p, void *data __unused)
2111 {
2112 struct vmspace *vm;
2113 vm_pindex_t limit, size;
2114
2115 /*
2116	 * if this is a system process or the process is exiting,
2117	 * skip it.
2118 */
2119 lwkt_gettoken(&p->p_token);
2120
2121 if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
2122 lwkt_reltoken(&p->p_token);
2123 return (0);
2124 }
2125
2126 /*
2127	 * if the process is in a non-running state,
2128	 * don't touch it.
2129 */
2130 if (p->p_stat != SACTIVE && p->p_stat != SSTOP) {
2131 lwkt_reltoken(&p->p_token);
2132 return (0);
2133 }
2134
2135 /*
2136 * get a limit
2137 */
2138 limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
2139 p->p_rlimit[RLIMIT_RSS].rlim_max));
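	/*
	 * OFF_TO_IDX() converts a byte count to a page index, so an
	 * RLIMIT_RSS of 64MB with 4KB pages becomes a limit of
	 * 64MB / 4KB = 16384 resident pages.
	 */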
2140
2141 /*
2142 * let processes that are swapped out really be
2143 * swapped out. Set the limit to nothing to get as
2144 * many pages out to swap as possible.
2145 */
2146 if (p->p_flags & P_SWAPPEDOUT)
2147 limit = 0;
2148
2149 vm = p->p_vmspace;
2150 vmspace_hold(vm);
2151 size = vmspace_resident_count(vm);
2152 	if (size >= limit) {	/* limit is unsigned (vm_pindex_t) */
2153 vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
2154 }
2155 vmspace_drop(vm);
2156
2157 lwkt_reltoken(&p->p_token);
2158
2159 return (0);
2160 }
2161
2162 #endif