FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_swapout.c
1 /*-
2 * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
3 *
4 * Copyright (c) 1991 Regents of the University of California.
5 * All rights reserved.
6 * Copyright (c) 1994 John S. Dyson
7 * All rights reserved.
8 * Copyright (c) 1994 David Greenman
9 * All rights reserved.
10 * Copyright (c) 2005 Yahoo! Technologies Norway AS
11 * All rights reserved.
12 *
13 * This code is derived from software contributed to Berkeley by
14 * The Mach Operating System project at Carnegie-Mellon University.
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
18 * are met:
19 * 1. Redistributions of source code must retain the above copyright
20 * notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 * notice, this list of conditions and the following disclaimer in the
23 * documentation and/or other materials provided with the distribution.
24 * 3. All advertising materials mentioning features or use of this software
25 * must display the following acknowledgement:
26 * This product includes software developed by the University of
27 * California, Berkeley and its contributors.
28 * 4. Neither the name of the University nor the names of its contributors
29 * may be used to endorse or promote products derived from this software
30 * without specific prior written permission.
31 *
32 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
42 * SUCH DAMAGE.
43 *
44 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91
45 *
46 *
47 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
48 * All rights reserved.
49 *
50 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
51 *
52 * Permission to use, copy, modify and distribute this software and
53 * its documentation is hereby granted, provided that both the copyright
54 * notice and this permission notice appear in all copies of the
55 * software, derivative works or modified versions, and any portions
56 * thereof, and that both notices appear in supporting documentation.
57 *
58 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
59 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
60 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
61 *
62 * Carnegie Mellon requests users of this software to return to
63 *
64 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
65 * School of Computer Science
66 * Carnegie Mellon University
67 * Pittsburgh PA 15213-3890
68 *
69 * any improvements or extensions that they make and grant Carnegie the
70 * rights to redistribute these changes.
71 */
72
73 #include <sys/cdefs.h>
74 __FBSDID("$FreeBSD$");
75
76 #include "opt_kstack_pages.h"
77 #include "opt_kstack_max_pages.h"
78 #include "opt_vm.h"
79
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/limits.h>
83 #include <sys/kernel.h>
84 #include <sys/eventhandler.h>
85 #include <sys/lock.h>
86 #include <sys/mutex.h>
87 #include <sys/proc.h>
88 #include <sys/_kstack_cache.h>
89 #include <sys/kthread.h>
90 #include <sys/ktr.h>
91 #include <sys/mount.h>
92 #include <sys/racct.h>
93 #include <sys/resourcevar.h>
94 #include <sys/sched.h>
95 #include <sys/sdt.h>
96 #include <sys/signalvar.h>
97 #include <sys/smp.h>
98 #include <sys/time.h>
99 #include <sys/vnode.h>
100 #include <sys/vmmeter.h>
101 #include <sys/rwlock.h>
102 #include <sys/sx.h>
103 #include <sys/sysctl.h>
104
105 #include <vm/vm.h>
106 #include <vm/vm_param.h>
107 #include <vm/vm_object.h>
108 #include <vm/vm_page.h>
109 #include <vm/vm_map.h>
110 #include <vm/vm_pageout.h>
111 #include <vm/vm_pager.h>
112 #include <vm/vm_phys.h>
113 #include <vm/swap_pager.h>
114 #include <vm/vm_extern.h>
115 #include <vm/uma.h>
116
117 /* the kernel process "vm_daemon" */
118 static void vm_daemon(void);
119 static struct proc *vmproc;
120
121 static struct kproc_desc vm_kp = {
122 "vmdaemon",
123 vm_daemon,
124 &vmproc
125 };
126 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
127
128 static int vm_swap_enabled = 1;
129 static int vm_swap_idle_enabled = 0;
130
131 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW,
132 &vm_swap_enabled, 0,
133 "Enable entire process swapout");
134 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW,
135 &vm_swap_idle_enabled, 0,
136 "Allow swapout on idle criteria");
137
138 /*
139 * Swap_idle_threshold1 is the guaranteed swapped in time for a process
140 */
141 static int swap_idle_threshold1 = 2;
142 SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
143 &swap_idle_threshold1, 0,
144 "Guaranteed swapped in time for a process");
145
146 /*
147 * Swap_idle_threshold2 is the time that a process can be idle before
148 * it will be swapped out, if idle swapping is enabled.
149 */
150 static int swap_idle_threshold2 = 10;
151 SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
152 &swap_idle_threshold2, 0,
153 "Time before a process will be swapped out");
154
155 static int vm_pageout_req_swapout; /* XXX */
156 static int vm_daemon_needed;
157 static struct mtx vm_daemon_mtx;
158 /* Allow for use by vm_pageout before vm_daemon is initialized. */
159 MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
160
161 static int swapped_cnt;
162 static int swap_inprogress; /* Pending swap-ins done outside swapper. */
163 static int last_swapin;
164
165 static void swapclear(struct proc *);
166 static int swapout(struct proc *);
167 static void vm_swapout_map_deactivate_pages(vm_map_t, long);
168 static void vm_swapout_object_deactivate_pages(pmap_t, vm_object_t, long);
169 static void swapout_procs(int action);
170 static void vm_req_vmdaemon(int req);
171 static void vm_thread_swapout(struct thread *td);
172
173 /*
174 * vm_swapout_object_deactivate_pages
175 *
176 * Deactivate enough pages to satisfy the inactive target
177 * requirements.
178 *
179 * The object and map must be locked.
180 */
181 static void
182 vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
183 long desired)
184 {
185 vm_object_t backing_object, object;
186 vm_page_t p;
187 int act_delta, remove_mode;
188
189 VM_OBJECT_ASSERT_LOCKED(first_object);
190 if ((first_object->flags & OBJ_FICTITIOUS) != 0)
191 return;
192 for (object = first_object;; object = backing_object) {
193 if (pmap_resident_count(pmap) <= desired)
194 goto unlock_return;
195 VM_OBJECT_ASSERT_LOCKED(object);
196 if ((object->flags & OBJ_UNMANAGED) != 0 ||
197 object->paging_in_progress != 0)
198 goto unlock_return;
199
200 remove_mode = 0;
201 if (object->shadow_count > 1)
202 remove_mode = 1;
203 /*
204 * Scan the object's entire memory queue.
205 */
206 TAILQ_FOREACH(p, &object->memq, listq) {
207 if (pmap_resident_count(pmap) <= desired)
208 goto unlock_return;
209 if (should_yield())
210 goto unlock_return;
211 if (vm_page_busied(p))
212 continue;
213 VM_CNT_INC(v_pdpages);
214 vm_page_lock(p);
215 if (vm_page_held(p) ||
216 !pmap_page_exists_quick(pmap, p)) {
217 vm_page_unlock(p);
218 continue;
219 }
220 act_delta = pmap_ts_referenced(p);
221 if ((p->aflags & PGA_REFERENCED) != 0) {
222 if (act_delta == 0)
223 act_delta = 1;
224 vm_page_aflag_clear(p, PGA_REFERENCED);
225 }
226 if (!vm_page_active(p) && act_delta != 0) {
227 vm_page_activate(p);
228 p->act_count += act_delta;
229 } else if (vm_page_active(p)) {
230 /*
231 * The page daemon does not requeue pages
232 * after modifying their activation count.
233 */
234 if (act_delta == 0) {
235 p->act_count -= min(p->act_count,
236 ACT_DECLINE);
237 if (!remove_mode && p->act_count == 0) {
238 pmap_remove_all(p);
239 vm_page_deactivate(p);
240 }
241 } else {
242 vm_page_activate(p);
243 if (p->act_count < ACT_MAX -
244 ACT_ADVANCE)
245 p->act_count += ACT_ADVANCE;
246 }
247 } else if (vm_page_inactive(p))
248 pmap_remove_all(p);
249 vm_page_unlock(p);
250 }
251 if ((backing_object = object->backing_object) == NULL)
252 goto unlock_return;
253 VM_OBJECT_RLOCK(backing_object);
254 if (object != first_object)
255 VM_OBJECT_RUNLOCK(object);
256 }
257 unlock_return:
258 if (object != first_object)
259 VM_OBJECT_RUNLOCK(object);
260 }
261
262 /*
263 * deactivate some number of pages in a map, try to do it fairly, but
264 * that is really hard to do.
265 */
266 static void
267 vm_swapout_map_deactivate_pages(vm_map_t map, long desired)
268 {
269 vm_map_entry_t tmpe;
270 vm_object_t obj, bigobj;
271 int nothingwired;
272
273 if (!vm_map_trylock_read(map))
274 return;
275
276 bigobj = NULL;
277 nothingwired = TRUE;
278
279 /*
280 * first, search out the biggest object, and try to free pages from
281 * that.
282 */
283 tmpe = map->header.next;
284 while (tmpe != &map->header) {
285 if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
286 obj = tmpe->object.vm_object;
287 if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
288 if (obj->shadow_count <= 1 &&
289 (bigobj == NULL ||
290 bigobj->resident_page_count <
291 obj->resident_page_count)) {
292 if (bigobj != NULL)
293 VM_OBJECT_RUNLOCK(bigobj);
294 bigobj = obj;
295 } else
296 VM_OBJECT_RUNLOCK(obj);
297 }
298 }
299 if (tmpe->wired_count > 0)
300 nothingwired = FALSE;
301 tmpe = tmpe->next;
302 }
303
304 if (bigobj != NULL) {
305 vm_swapout_object_deactivate_pages(map->pmap, bigobj, desired);
306 VM_OBJECT_RUNLOCK(bigobj);
307 }
308 /*
309 * Next, hunt around for other pages to deactivate. We actually
310 * do this search sort of wrong -- .text first is not the best idea.
311 */
312 tmpe = map->header.next;
313 while (tmpe != &map->header) {
314 if (pmap_resident_count(vm_map_pmap(map)) <= desired)
315 break;
316 if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
317 obj = tmpe->object.vm_object;
318 if (obj != NULL) {
319 VM_OBJECT_RLOCK(obj);
320 vm_swapout_object_deactivate_pages(map->pmap,
321 obj, desired);
322 VM_OBJECT_RUNLOCK(obj);
323 }
324 }
325 tmpe = tmpe->next;
326 }
327
328 /*
329 * Remove all mappings if a process is swapped out, this will free page
330 * table pages.
331 */
332 if (desired == 0 && nothingwired) {
333 pmap_remove(vm_map_pmap(map), vm_map_min(map),
334 vm_map_max(map));
335 }
336
337 vm_map_unlock_read(map);
338 }
339
/*
 * Swap out request bits, OR'ed into vm_pageout_req_swapout and passed to
 * swapout_procs() as its "action" mask.
 */
#define	VM_SWAP_NORMAL	0x1
#define	VM_SWAP_IDLE	0x2
345
346 void
347 vm_swapout_run(void)
348 {
349
350 if (vm_swap_enabled)
351 vm_req_vmdaemon(VM_SWAP_NORMAL);
352 }
353
354 /*
355 * Idle process swapout -- run once per second when pagedaemons are
356 * reclaiming pages.
357 */
358 void
359 vm_swapout_run_idle(void)
360 {
361 static long lsec;
362
363 if (!vm_swap_idle_enabled || time_second == lsec)
364 return;
365 vm_req_vmdaemon(VM_SWAP_IDLE);
366 lsec = time_second;
367 }
368
369 static void
370 vm_req_vmdaemon(int req)
371 {
372 static int lastrun = 0;
373
374 mtx_lock(&vm_daemon_mtx);
375 vm_pageout_req_swapout |= req;
376 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
377 wakeup(&vm_daemon_needed);
378 lastrun = ticks;
379 }
380 mtx_unlock(&vm_daemon_mtx);
381 }
382
383 static void
384 vm_daemon(void)
385 {
386 struct rlimit rsslim;
387 struct proc *p;
388 struct thread *td;
389 struct vmspace *vm;
390 int breakout, swapout_flags, tryagain, attempts;
391 #ifdef RACCT
392 uint64_t rsize, ravailable;
393 #endif
394
395 while (TRUE) {
396 mtx_lock(&vm_daemon_mtx);
397 msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
398 #ifdef RACCT
399 racct_enable ? hz : 0
400 #else
401 0
402 #endif
403 );
404 swapout_flags = vm_pageout_req_swapout;
405 vm_pageout_req_swapout = 0;
406 mtx_unlock(&vm_daemon_mtx);
407 if (swapout_flags != 0) {
408 /*
409 * Drain the per-CPU page queue batches as a deadlock
410 * avoidance measure.
411 */
412 if ((swapout_flags & VM_SWAP_NORMAL) != 0)
413 vm_page_drain_pqbatch();
414 swapout_procs(swapout_flags);
415 }
416
417 /*
418 * scan the processes for exceeding their rlimits or if
419 * process is swapped out -- deactivate pages
420 */
421 tryagain = 0;
422 attempts = 0;
423 again:
424 attempts++;
425 sx_slock(&allproc_lock);
426 FOREACH_PROC_IN_SYSTEM(p) {
427 vm_pindex_t limit, size;
428
429 /*
430 * if this is a system process or if we have already
431 * looked at this process, skip it.
432 */
433 PROC_LOCK(p);
434 if (p->p_state != PRS_NORMAL ||
435 p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
436 PROC_UNLOCK(p);
437 continue;
438 }
439 /*
440 * if the process is in a non-running type state,
441 * don't touch it.
442 */
443 breakout = 0;
444 FOREACH_THREAD_IN_PROC(p, td) {
445 thread_lock(td);
446 if (!TD_ON_RUNQ(td) &&
447 !TD_IS_RUNNING(td) &&
448 !TD_IS_SLEEPING(td) &&
449 !TD_IS_SUSPENDED(td)) {
450 thread_unlock(td);
451 breakout = 1;
452 break;
453 }
454 thread_unlock(td);
455 }
456 if (breakout) {
457 PROC_UNLOCK(p);
458 continue;
459 }
460 /*
461 * get a limit
462 */
463 lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
464 limit = OFF_TO_IDX(
465 qmin(rsslim.rlim_cur, rsslim.rlim_max));
466
467 /*
468 * let processes that are swapped out really be
469 * swapped out set the limit to nothing (will force a
470 * swap-out.)
471 */
472 if ((p->p_flag & P_INMEM) == 0)
473 limit = 0; /* XXX */
474 vm = vmspace_acquire_ref(p);
475 _PHOLD_LITE(p);
476 PROC_UNLOCK(p);
477 if (vm == NULL) {
478 PRELE(p);
479 continue;
480 }
481 sx_sunlock(&allproc_lock);
482
483 size = vmspace_resident_count(vm);
484 if (size >= limit) {
485 vm_swapout_map_deactivate_pages(
486 &vm->vm_map, limit);
487 size = vmspace_resident_count(vm);
488 }
489 #ifdef RACCT
490 if (racct_enable) {
491 rsize = IDX_TO_OFF(size);
492 PROC_LOCK(p);
493 if (p->p_state == PRS_NORMAL)
494 racct_set(p, RACCT_RSS, rsize);
495 ravailable = racct_get_available(p, RACCT_RSS);
496 PROC_UNLOCK(p);
497 if (rsize > ravailable) {
498 /*
499 * Don't be overly aggressive; this
500 * might be an innocent process,
501 * and the limit could've been exceeded
502 * by some memory hog. Don't try
503 * to deactivate more than 1/4th
504 * of process' resident set size.
505 */
506 if (attempts <= 8) {
507 if (ravailable < rsize -
508 (rsize / 4)) {
509 ravailable = rsize -
510 (rsize / 4);
511 }
512 }
513 vm_swapout_map_deactivate_pages(
514 &vm->vm_map,
515 OFF_TO_IDX(ravailable));
516 /* Update RSS usage after paging out. */
517 size = vmspace_resident_count(vm);
518 rsize = IDX_TO_OFF(size);
519 PROC_LOCK(p);
520 if (p->p_state == PRS_NORMAL)
521 racct_set(p, RACCT_RSS, rsize);
522 PROC_UNLOCK(p);
523 if (rsize > ravailable)
524 tryagain = 1;
525 }
526 }
527 #endif
528 vmspace_free(vm);
529 sx_slock(&allproc_lock);
530 PRELE(p);
531 }
532 sx_sunlock(&allproc_lock);
533 if (tryagain != 0 && attempts <= 10) {
534 maybe_yield();
535 goto again;
536 }
537 }
538 }
539
540 /*
541 * Allow a thread's kernel stack to be paged out.
542 */
543 static void
544 vm_thread_swapout(struct thread *td)
545 {
546 vm_object_t ksobj;
547 vm_page_t m;
548 int i, pages;
549
550 cpu_thread_swapout(td);
551 pages = td->td_kstack_pages;
552 ksobj = td->td_kstack_obj;
553 pmap_qremove(td->td_kstack, pages);
554 VM_OBJECT_WLOCK(ksobj);
555 for (i = 0; i < pages; i++) {
556 m = vm_page_lookup(ksobj, i);
557 if (m == NULL)
558 panic("vm_thread_swapout: kstack already missing?");
559 vm_page_dirty(m);
560 vm_page_lock(m);
561 vm_page_unwire(m, PQ_LAUNDRY);
562 vm_page_unlock(m);
563 }
564 VM_OBJECT_WUNLOCK(ksobj);
565 }
566
567 /*
568 * Bring the kernel stack for a specified thread back in.
569 */
570 static void
571 vm_thread_swapin(struct thread *td, int oom_alloc)
572 {
573 vm_object_t ksobj;
574 vm_page_t ma[KSTACK_MAX_PAGES];
575 int a, count, i, j, pages, rv;
576
577 pages = td->td_kstack_pages;
578 ksobj = td->td_kstack_obj;
579 VM_OBJECT_WLOCK(ksobj);
580 (void)vm_page_grab_pages(ksobj, 0, oom_alloc | VM_ALLOC_WIRED, ma,
581 pages);
582 for (i = 0; i < pages;) {
583 vm_page_assert_xbusied(ma[i]);
584 if (ma[i]->valid == VM_PAGE_BITS_ALL) {
585 vm_page_xunbusy(ma[i]);
586 i++;
587 continue;
588 }
589 vm_object_pip_add(ksobj, 1);
590 for (j = i + 1; j < pages; j++)
591 if (ma[j]->valid == VM_PAGE_BITS_ALL)
592 break;
593 rv = vm_pager_has_page(ksobj, ma[i]->pindex, NULL, &a);
594 KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i]));
595 count = min(a + 1, j - i);
596 rv = vm_pager_get_pages(ksobj, ma + i, count, NULL, NULL);
597 KASSERT(rv == VM_PAGER_OK, ("%s: cannot get kstack for proc %d",
598 __func__, td->td_proc->p_pid));
599 vm_object_pip_wakeup(ksobj);
600 for (j = i; j < i + count; j++)
601 vm_page_xunbusy(ma[j]);
602 i += count;
603 }
604 VM_OBJECT_WUNLOCK(ksobj);
605 pmap_qenter(td->td_kstack, ma, pages);
606 cpu_thread_swapin(td);
607 }
608
609 void
610 faultin(struct proc *p)
611 {
612 struct thread *td;
613 int oom_alloc;
614
615 PROC_LOCK_ASSERT(p, MA_OWNED);
616
617 /*
618 * If another process is swapping in this process,
619 * just wait until it finishes.
620 */
621 if (p->p_flag & P_SWAPPINGIN) {
622 while (p->p_flag & P_SWAPPINGIN)
623 msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
624 return;
625 }
626
627 if ((p->p_flag & P_INMEM) == 0) {
628 oom_alloc = (p->p_flag & P_WKILLED) != 0 ? VM_ALLOC_SYSTEM :
629 VM_ALLOC_NORMAL;
630
631 /*
632 * Don't let another thread swap process p out while we are
633 * busy swapping it in.
634 */
635 ++p->p_lock;
636 p->p_flag |= P_SWAPPINGIN;
637 PROC_UNLOCK(p);
638 sx_xlock(&allproc_lock);
639 MPASS(swapped_cnt > 0);
640 swapped_cnt--;
641 if (curthread != &thread0)
642 swap_inprogress++;
643 sx_xunlock(&allproc_lock);
644
645 /*
646 * We hold no lock here because the list of threads
647 * can not change while all threads in the process are
648 * swapped out.
649 */
650 FOREACH_THREAD_IN_PROC(p, td)
651 vm_thread_swapin(td, oom_alloc);
652
653 if (curthread != &thread0) {
654 sx_xlock(&allproc_lock);
655 MPASS(swap_inprogress > 0);
656 swap_inprogress--;
657 last_swapin = ticks;
658 sx_xunlock(&allproc_lock);
659 }
660 PROC_LOCK(p);
661 swapclear(p);
662 p->p_swtick = ticks;
663
664 /* Allow other threads to swap p out now. */
665 wakeup(&p->p_flag);
666 --p->p_lock;
667 }
668 }
669
670 /*
671 * This swapin algorithm attempts to swap-in processes only if there
672 * is enough space for them. Of course, if a process waits for a long
673 * time, it will be swapped in anyway.
674 */
675
676 static struct proc *
677 swapper_selector(bool wkilled_only)
678 {
679 struct proc *p, *res;
680 struct thread *td;
681 int ppri, pri, slptime, swtime;
682
683 sx_assert(&allproc_lock, SA_SLOCKED);
684 if (swapped_cnt == 0)
685 return (NULL);
686 res = NULL;
687 ppri = INT_MIN;
688 FOREACH_PROC_IN_SYSTEM(p) {
689 PROC_LOCK(p);
690 if (p->p_state == PRS_NEW || (p->p_flag & (P_SWAPPINGOUT |
691 P_SWAPPINGIN | P_INMEM)) != 0) {
692 PROC_UNLOCK(p);
693 continue;
694 }
695 if (p->p_state == PRS_NORMAL && (p->p_flag & P_WKILLED) != 0) {
696 /*
697 * A swapped-out process might have mapped a
698 * large portion of the system's pages as
699 * anonymous memory. There is no other way to
700 * release the memory other than to kill the
701 * process, for which we need to swap it in.
702 */
703 return (p);
704 }
705 if (wkilled_only) {
706 PROC_UNLOCK(p);
707 continue;
708 }
709 swtime = (ticks - p->p_swtick) / hz;
710 FOREACH_THREAD_IN_PROC(p, td) {
711 /*
712 * An otherwise runnable thread of a process
713 * swapped out has only the TDI_SWAPPED bit set.
714 */
715 thread_lock(td);
716 if (td->td_inhibitors == TDI_SWAPPED) {
717 slptime = (ticks - td->td_slptick) / hz;
718 pri = swtime + slptime;
719 if ((td->td_flags & TDF_SWAPINREQ) == 0)
720 pri -= p->p_nice * 8;
721 /*
722 * if this thread is higher priority
723 * and there is enough space, then select
724 * this process instead of the previous
725 * selection.
726 */
727 if (pri > ppri) {
728 res = p;
729 ppri = pri;
730 }
731 }
732 thread_unlock(td);
733 }
734 PROC_UNLOCK(p);
735 }
736
737 if (res != NULL)
738 PROC_LOCK(res);
739 return (res);
740 }
741
742 #define SWAPIN_INTERVAL (MAXSLP * hz / 2)
743
744 /*
745 * Limit swapper to swap in one non-WKILLED process in MAXSLP/2
746 * interval, assuming that there is:
747 * - there exists at least one domain that is not suffering from a shortage of
748 * free memory;
749 * - no parallel swap-ins;
750 * - no other swap-ins in the current SWAPIN_INTERVAL.
751 */
752 static bool
753 swapper_wkilled_only(void)
754 {
755
756 return (vm_page_count_min_set(&all_domains) || swap_inprogress > 0 ||
757 (u_int)(ticks - last_swapin) < SWAPIN_INTERVAL);
758 }
759
760 void
761 swapper(void)
762 {
763 struct proc *p;
764
765 for (;;) {
766 sx_slock(&allproc_lock);
767 p = swapper_selector(swapper_wkilled_only());
768 sx_sunlock(&allproc_lock);
769
770 if (p == NULL) {
771 tsleep(&proc0, PVM, "swapin", SWAPIN_INTERVAL);
772 } else {
773 PROC_LOCK_ASSERT(p, MA_OWNED);
774
775 /*
776 * Another process may be bringing or may have
777 * already brought this process in while we
778 * traverse all threads. Or, this process may
779 * have exited or even being swapped out
780 * again.
781 */
782 if (p->p_state == PRS_NORMAL && (p->p_flag & (P_INMEM |
783 P_SWAPPINGOUT | P_SWAPPINGIN)) == 0) {
784 faultin(p);
785 }
786 PROC_UNLOCK(p);
787 }
788 }
789 }
790
791 /*
792 * First, if any processes have been sleeping or stopped for at least
793 * "swap_idle_threshold1" seconds, they are swapped out. If, however,
794 * no such processes exist, then the longest-sleeping or stopped
795 * process is swapped out. Finally, and only as a last resort, if
796 * there are no sleeping or stopped processes, the longest-resident
797 * process is swapped out.
798 */
799 static void
800 swapout_procs(int action)
801 {
802 struct proc *p;
803 struct thread *td;
804 int slptime;
805 bool didswap, doswap;
806
807 MPASS((action & (VM_SWAP_NORMAL | VM_SWAP_IDLE)) != 0);
808
809 didswap = false;
810 sx_slock(&allproc_lock);
811 FOREACH_PROC_IN_SYSTEM(p) {
812 /*
813 * Filter out not yet fully constructed processes. Do
814 * not swap out held processes. Avoid processes which
815 * are system, exiting, execing, traced, already swapped
816 * out or are in the process of being swapped in or out.
817 */
818 PROC_LOCK(p);
819 if (p->p_state != PRS_NORMAL || p->p_lock != 0 || (p->p_flag &
820 (P_SYSTEM | P_WEXIT | P_INEXEC | P_STOPPED_SINGLE |
821 P_TRACED | P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) !=
822 P_INMEM) {
823 PROC_UNLOCK(p);
824 continue;
825 }
826
827 /*
828 * Further consideration of this process for swap out
829 * requires iterating over its threads. We release
830 * allproc_lock here so that process creation and
831 * destruction are not blocked while we iterate.
832 *
833 * To later reacquire allproc_lock and resume
834 * iteration over the allproc list, we will first have
835 * to release the lock on the process. We place a
836 * hold on the process so that it remains in the
837 * allproc list while it is unlocked.
838 */
839 _PHOLD_LITE(p);
840 sx_sunlock(&allproc_lock);
841
842 /*
843 * Do not swapout a realtime process.
844 * Guarantee swap_idle_threshold1 time in memory.
845 * If the system is under memory stress, or if we are
846 * swapping idle processes >= swap_idle_threshold2,
847 * then swap the process out.
848 */
849 doswap = true;
850 FOREACH_THREAD_IN_PROC(p, td) {
851 thread_lock(td);
852 slptime = (ticks - td->td_slptick) / hz;
853 if (PRI_IS_REALTIME(td->td_pri_class) ||
854 slptime < swap_idle_threshold1 ||
855 !thread_safetoswapout(td) ||
856 ((action & VM_SWAP_NORMAL) == 0 &&
857 slptime < swap_idle_threshold2))
858 doswap = false;
859 thread_unlock(td);
860 if (!doswap)
861 break;
862 }
863 if (doswap && swapout(p) == 0)
864 didswap = true;
865
866 PROC_UNLOCK(p);
867 if (didswap) {
868 sx_xlock(&allproc_lock);
869 swapped_cnt++;
870 sx_downgrade(&allproc_lock);
871 } else
872 sx_slock(&allproc_lock);
873 PRELE(p);
874 }
875 sx_sunlock(&allproc_lock);
876
877 /*
878 * If we swapped something out, and another process needed memory,
879 * then wakeup the sched process.
880 */
881 if (didswap)
882 wakeup(&proc0);
883 }
884
885 static void
886 swapclear(struct proc *p)
887 {
888 struct thread *td;
889
890 PROC_LOCK_ASSERT(p, MA_OWNED);
891
892 FOREACH_THREAD_IN_PROC(p, td) {
893 thread_lock(td);
894 td->td_flags |= TDF_INMEM;
895 td->td_flags &= ~TDF_SWAPINREQ;
896 TD_CLR_SWAPPED(td);
897 if (TD_CAN_RUN(td))
898 if (setrunnable(td)) {
899 #ifdef INVARIANTS
900 /*
901 * XXX: We just cleared TDI_SWAPPED
902 * above and set TDF_INMEM, so this
903 * should never happen.
904 */
905 panic("not waking up swapper");
906 #endif
907 }
908 thread_unlock(td);
909 }
910 p->p_flag &= ~(P_SWAPPINGIN | P_SWAPPINGOUT);
911 p->p_flag |= P_INMEM;
912 }
913
914 static int
915 swapout(struct proc *p)
916 {
917 struct thread *td;
918
919 PROC_LOCK_ASSERT(p, MA_OWNED);
920
921 /*
922 * The states of this process and its threads may have changed
923 * by now. Assuming that there is only one pageout daemon thread,
924 * this process should still be in memory.
925 */
926 KASSERT((p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) ==
927 P_INMEM, ("swapout: lost a swapout race?"));
928
929 /*
930 * Remember the resident count.
931 */
932 p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
933
934 /*
935 * Check and mark all threads before we proceed.
936 */
937 p->p_flag &= ~P_INMEM;
938 p->p_flag |= P_SWAPPINGOUT;
939 FOREACH_THREAD_IN_PROC(p, td) {
940 thread_lock(td);
941 if (!thread_safetoswapout(td)) {
942 thread_unlock(td);
943 swapclear(p);
944 return (EBUSY);
945 }
946 td->td_flags &= ~TDF_INMEM;
947 TD_SET_SWAPPED(td);
948 thread_unlock(td);
949 }
950 td = FIRST_THREAD_IN_PROC(p);
951 ++td->td_ru.ru_nswap;
952 PROC_UNLOCK(p);
953
954 /*
955 * This list is stable because all threads are now prevented from
956 * running. The list is only modified in the context of a running
957 * thread in this process.
958 */
959 FOREACH_THREAD_IN_PROC(p, td)
960 vm_thread_swapout(td);
961
962 PROC_LOCK(p);
963 p->p_flag &= ~P_SWAPPINGOUT;
964 p->p_swtick = ticks;
965 return (0);
966 }
Cache object: acfe23cf3436cb45131b076bee267348
|