vm_pageout.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*-
    2  * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
    3  *
    4  * Copyright (c) 1991 Regents of the University of California.
    5  * All rights reserved.
    6  * Copyright (c) 1994 John S. Dyson
    7  * All rights reserved.
    8  * Copyright (c) 1994 David Greenman
    9  * All rights reserved.
   10  * Copyright (c) 2005 Yahoo! Technologies Norway AS
   11  * All rights reserved.
   12  *
   13  * This code is derived from software contributed to Berkeley by
   14  * The Mach Operating System project at Carnegie-Mellon University.
   15  *
   16  * Redistribution and use in source and binary forms, with or without
   17  * modification, are permitted provided that the following conditions
   18  * are met:
   19  * 1. Redistributions of source code must retain the above copyright
   20  *    notice, this list of conditions and the following disclaimer.
   21  * 2. Redistributions in binary form must reproduce the above copyright
   22  *    notice, this list of conditions and the following disclaimer in the
   23  *    documentation and/or other materials provided with the distribution.
   24  * 3. All advertising materials mentioning features or use of this software
   25  *    must display the following acknowledgement:
   26  *      This product includes software developed by the University of
   27  *      California, Berkeley and its contributors.
   28  * 4. Neither the name of the University nor the names of its contributors
   29  *    may be used to endorse or promote products derived from this software
   30  *    without specific prior written permission.
   31  *
   32  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   33  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   34  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   35  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   36  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   37  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   38  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   39  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   40  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   41  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   42  * SUCH DAMAGE.
   43  *
   44  *      from: @(#)vm_pageout.c  7.4 (Berkeley) 5/7/91
   45  *
   46  *
   47  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
   48  * All rights reserved.
   49  *
   50  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
   51  *
   52  * Permission to use, copy, modify and distribute this software and
   53  * its documentation is hereby granted, provided that both the copyright
   54  * notice and this permission notice appear in all copies of the
   55  * software, derivative works or modified versions, and any portions
   56  * thereof, and that both notices appear in supporting documentation.
   57  *
   58  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   59  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   60  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   61  *
   62  * Carnegie Mellon requests users of this software to return to
   63  *
   64  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   65  *  School of Computer Science
   66  *  Carnegie Mellon University
   67  *  Pittsburgh PA 15213-3890
   68  *
   69  * any improvements or extensions that they make and grant Carnegie the
   70  * rights to redistribute these changes.
   71  */
   72 
   73 /*
   74  *      The proverbial page-out daemon.
   75  */
   76 
   77 #include <sys/cdefs.h>
   78 __FBSDID("$FreeBSD: releng/12.0/sys/vm/vm_pageout.c 340396 2018-11-13 16:51:30Z markj $");
   79 
   80 #include "opt_vm.h"
   81 
   82 #include <sys/param.h>
   83 #include <sys/systm.h>
   84 #include <sys/kernel.h>
   85 #include <sys/eventhandler.h>
   86 #include <sys/lock.h>
   87 #include <sys/mutex.h>
   88 #include <sys/proc.h>
   89 #include <sys/kthread.h>
   90 #include <sys/ktr.h>
   91 #include <sys/mount.h>
   92 #include <sys/racct.h>
   93 #include <sys/resourcevar.h>
   94 #include <sys/sched.h>
   95 #include <sys/sdt.h>
   96 #include <sys/signalvar.h>
   97 #include <sys/smp.h>
   98 #include <sys/time.h>
   99 #include <sys/vnode.h>
  100 #include <sys/vmmeter.h>
  101 #include <sys/rwlock.h>
  102 #include <sys/sx.h>
  103 #include <sys/sysctl.h>
  104 
  105 #include <vm/vm.h>
  106 #include <vm/vm_param.h>
  107 #include <vm/vm_object.h>
  108 #include <vm/vm_page.h>
  109 #include <vm/vm_map.h>
  110 #include <vm/vm_pageout.h>
  111 #include <vm/vm_pager.h>
  112 #include <vm/vm_phys.h>
  113 #include <vm/vm_pagequeue.h>
  114 #include <vm/swap_pager.h>
  115 #include <vm/vm_extern.h>
  116 #include <vm/uma.h>
  117 
  118 /*
  119  * System initialization
  120  */
  121 
  122 /* the kernel process "vm_pageout"*/
  123 static void vm_pageout(void);
  124 static void vm_pageout_init(void);
  125 static int vm_pageout_clean(vm_page_t m, int *numpagedout);
  126 static int vm_pageout_cluster(vm_page_t m);
  127 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
  128     int starting_page_shortage);
  129 
  130 SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
  131     NULL);
  132 
  133 struct proc *pageproc;
  134 
  135 static struct kproc_desc page_kp = {
  136         "pagedaemon",
  137         vm_pageout,
  138         &pageproc
  139 };
  140 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
  141     &page_kp);
  142 
  143 SDT_PROVIDER_DEFINE(vm);
  144 SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
  145 
  146 /* Pagedaemon activity rates, in subdivisions of one second. */
  147 #define VM_LAUNDER_RATE         10
  148 #define VM_INACT_SCAN_RATE      10
  149 
  150 static int vm_pageout_oom_seq = 12;
  151 
  152 static int vm_pageout_update_period;
  153 static int disable_swap_pageouts;
  154 static int lowmem_period = 10;
  155 static int swapdev_enabled;
  156 
  157 static int vm_panic_on_oom = 0;
  158 
  159 SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
  160         CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
  161         "panic on out of memory instead of killing the largest process");
  162 
  163 SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
  164         CTLFLAG_RWTUN, &vm_pageout_update_period, 0,
  165         "Maximum active LRU update period");
  166   
  167 SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0,
  168         "Low memory callback period");
  169 
  170 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
  171         CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
  172 
  173 static int pageout_lock_miss;
  174 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
  175         CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
  176 
  177 SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
  178         CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0,
  179         "back-to-back calls to oom detector to start OOM");
  180 
  181 static int act_scan_laundry_weight = 3;
  182 SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN,
  183     &act_scan_laundry_weight, 0,
  184     "weight given to clean vs. dirty pages in active queue scans");
  185 
  186 static u_int vm_background_launder_rate = 4096;
  187 SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN,
  188     &vm_background_launder_rate, 0,
  189     "background laundering rate, in kilobytes per second");
  190 
  191 static u_int vm_background_launder_max = 20 * 1024;
  192 SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN,
  193     &vm_background_launder_max, 0, "background laundering cap, in kilobytes");
  194 
  195 int vm_pageout_page_count = 32;
  196 
  197 int vm_page_max_wired;          /* XXX max # of wired pages system-wide */
  198 SYSCTL_INT(_vm, OID_AUTO, max_wired,
  199         CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
  200 
  201 static u_int isqrt(u_int num);
  202 static int vm_pageout_launder(struct vm_domain *vmd, int launder,
  203     bool in_shortfall);
  204 static void vm_pageout_laundry_worker(void *arg);
  205 
  206 struct scan_state {
  207         struct vm_batchqueue bq;
  208         struct vm_pagequeue *pq;
  209         vm_page_t       marker;
  210         int             maxscan;
  211         int             scanned;
  212 };
  213 
  214 static void
  215 vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq,
  216     vm_page_t marker, vm_page_t after, int maxscan)
  217 {
  218 
  219         vm_pagequeue_assert_locked(pq);
  220         KASSERT((marker->aflags & PGA_ENQUEUED) == 0,
  221             ("marker %p already enqueued", marker));
  222 
  223         if (after == NULL)
  224                 TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q);
  225         else
  226                 TAILQ_INSERT_AFTER(&pq->pq_pl, after, marker, plinks.q);
  227         vm_page_aflag_set(marker, PGA_ENQUEUED);
  228 
  229         vm_batchqueue_init(&ss->bq);
  230         ss->pq = pq;
  231         ss->marker = marker;
  232         ss->maxscan = maxscan;
  233         ss->scanned = 0;
  234         vm_pagequeue_unlock(pq);
  235 }
  236 
  237 static void
  238 vm_pageout_end_scan(struct scan_state *ss)
  239 {
  240         struct vm_pagequeue *pq;
  241 
  242         pq = ss->pq;
  243         vm_pagequeue_assert_locked(pq);
  244         KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0,
  245             ("marker %p not enqueued", ss->marker));
  246 
  247         TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q);
  248         vm_page_aflag_clear(ss->marker, PGA_ENQUEUED);
  249         pq->pq_pdpages += ss->scanned;
  250 }
  251 
  252 /*
  253  * Add a small number of queued pages to a batch queue for later processing
  254  * without the corresponding queue lock held.  The caller must have enqueued a
  255  * marker page at the desired start point for the scan.  Pages will be
  256  * physically dequeued if the caller so requests.  Otherwise, the returned
  257  * batch may contain marker pages, and it is up to the caller to handle them.
  258  *
  259  * When processing the batch queue, vm_page_queue() must be used to
  260  * determine whether the page has been logically dequeued by another thread.
  261  * Once this check is performed, the page lock guarantees that the page will
  262  * not be disassociated from the queue.
  263  */
  264 static __always_inline void
  265 vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue)
  266 {
  267         struct vm_pagequeue *pq;
  268         vm_page_t m, marker;
  269 
  270         marker = ss->marker;
  271         pq = ss->pq;
  272 
  273         KASSERT((marker->aflags & PGA_ENQUEUED) != 0,
  274             ("marker %p not enqueued", ss->marker));
  275 
  276         vm_pagequeue_lock(pq);
  277         for (m = TAILQ_NEXT(marker, plinks.q); m != NULL &&
  278             ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE;
  279             m = TAILQ_NEXT(m, plinks.q), ss->scanned++) {
  280                 if ((m->flags & PG_MARKER) == 0) {
  281                         KASSERT((m->aflags & PGA_ENQUEUED) != 0,
  282                             ("page %p not enqueued", m));
  283                         KASSERT((m->flags & PG_FICTITIOUS) == 0,
  284                             ("Fictitious page %p cannot be in page queue", m));
  285                         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  286                             ("Unmanaged page %p cannot be in page queue", m));
  287                 } else if (dequeue)
  288                         continue;
  289 
  290                 (void)vm_batchqueue_insert(&ss->bq, m);
  291                 if (dequeue) {
  292                         TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
  293                         vm_page_aflag_clear(m, PGA_ENQUEUED);
  294                 }
  295         }
  296         TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);
  297         if (__predict_true(m != NULL))
  298                 TAILQ_INSERT_BEFORE(m, marker, plinks.q);
  299         else
  300                 TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q);
  301         if (dequeue)
  302                 vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt);
  303         vm_pagequeue_unlock(pq);
  304 }
  305 
  306 /* Return the next page to be scanned, or NULL if the scan is complete. */
  307 static __always_inline vm_page_t
  308 vm_pageout_next(struct scan_state *ss, const bool dequeue)
  309 {
  310 
  311         if (ss->bq.bq_cnt == 0)
  312                 vm_pageout_collect_batch(ss, dequeue);
  313         return (vm_batchqueue_pop(&ss->bq));
  314 }
  315 
  316 /*
  317  * Scan for pages at adjacent offsets within the given page's object that are
  318  * eligible for laundering, form a cluster of these pages and the given page,
  319  * and launder that cluster.
  320  */
  321 static int
  322 vm_pageout_cluster(vm_page_t m)
  323 {
  324         vm_object_t object;
  325         vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps;
  326         vm_pindex_t pindex;
  327         int ib, is, page_base, pageout_count;
  328 
  329         vm_page_assert_locked(m);
  330         object = m->object;
  331         VM_OBJECT_ASSERT_WLOCKED(object);
  332         pindex = m->pindex;
  333 
  334         vm_page_assert_unbusied(m);
  335         KASSERT(!vm_page_held(m), ("page %p is held", m));
  336 
  337         pmap_remove_write(m);
  338         vm_page_unlock(m);
  339 
  340         mc[vm_pageout_page_count] = pb = ps = m;
  341         pageout_count = 1;
  342         page_base = vm_pageout_page_count;
  343         ib = 1;
  344         is = 1;
  345 
  346         /*
  347          * We can cluster only if the page is not clean, busy, or held, and
  348          * the page is in the laundry queue.
  349          *
  350          * During heavy mmap/modification loads the pageout
  351          * daemon can really fragment the underlying file
  352          * due to flushing pages out of order and not trying to
  353          * align the clusters (which leaves sporadic out-of-order
  354          * holes).  To solve this problem we do the reverse scan
  355          * first and attempt to align our cluster, then do a 
  356          * forward scan if room remains.
  357          */
  358 more:
  359         while (ib != 0 && pageout_count < vm_pageout_page_count) {
  360                 if (ib > pindex) {
  361                         ib = 0;
  362                         break;
  363                 }
  364                 if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
  365                         ib = 0;
  366                         break;
  367                 }
  368                 vm_page_test_dirty(p);
  369                 if (p->dirty == 0) {
  370                         ib = 0;
  371                         break;
  372                 }
  373                 vm_page_lock(p);
  374                 if (vm_page_held(p) || !vm_page_in_laundry(p)) {
  375                         vm_page_unlock(p);
  376                         ib = 0;
  377                         break;
  378                 }
  379                 pmap_remove_write(p);
  380                 vm_page_unlock(p);
  381                 mc[--page_base] = pb = p;
  382                 ++pageout_count;
  383                 ++ib;
  384 
  385                 /*
  386                  * We are at an alignment boundary.  Stop here, and switch
  387                  * directions.  Do not clear ib.
  388                  */
  389                 if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
  390                         break;
  391         }
  392         while (pageout_count < vm_pageout_page_count && 
  393             pindex + is < object->size) {
  394                 if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
  395                         break;
  396                 vm_page_test_dirty(p);
  397                 if (p->dirty == 0)
  398                         break;
  399                 vm_page_lock(p);
  400                 if (vm_page_held(p) || !vm_page_in_laundry(p)) {
  401                         vm_page_unlock(p);
  402                         break;
  403                 }
  404                 pmap_remove_write(p);
  405                 vm_page_unlock(p);
  406                 mc[page_base + pageout_count] = ps = p;
  407                 ++pageout_count;
  408                 ++is;
  409         }
  410 
  411         /*
  412          * If we exhausted our forward scan, continue with the reverse scan
  413          * when possible, even past an alignment boundary.  This catches
  414          * boundary conditions.
  415          */
  416         if (ib != 0 && pageout_count < vm_pageout_page_count)
  417                 goto more;
  418 
  419         return (vm_pageout_flush(&mc[page_base], pageout_count,
  420             VM_PAGER_PUT_NOREUSE, 0, NULL, NULL));
  421 }
  422 
  423 /*
  424  * vm_pageout_flush() - launder the given pages
  425  *
  426  *      The given pages are laundered.  Note that we setup for the start of
  427  *      I/O ( i.e. busy the page ), mark it read-only, and bump the object
  428  *      reference count all in here rather then in the parent.  If we want
  429  *      the parent to do more sophisticated things we may have to change
  430  *      the ordering.
  431  *
  432  *      Returned runlen is the count of pages between mreq and first
  433  *      page after mreq with status VM_PAGER_AGAIN.
  434  *      *eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL
  435  *      for any page in runlen set.
  436  */
  437 int
  438 vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
  439     boolean_t *eio)
  440 {
  441         vm_object_t object = mc[0]->object;
  442         int pageout_status[count];
  443         int numpagedout = 0;
  444         int i, runlen;
  445 
  446         VM_OBJECT_ASSERT_WLOCKED(object);
  447 
  448         /*
  449          * Initiate I/O.  Mark the pages busy and verify that they're valid
  450          * and read-only.
  451          *
  452          * We do not have to fixup the clean/dirty bits here... we can
  453          * allow the pager to do it after the I/O completes.
  454          *
  455          * NOTE! mc[i]->dirty may be partial or fragmented due to an
  456          * edge case with file fragments.
  457          */
  458         for (i = 0; i < count; i++) {
  459                 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
  460                     ("vm_pageout_flush: partially invalid page %p index %d/%d",
  461                         mc[i], i, count));
  462                 KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0,
  463                     ("vm_pageout_flush: writeable page %p", mc[i]));
  464                 vm_page_sbusy(mc[i]);
  465         }
  466         vm_object_pip_add(object, count);
  467 
  468         vm_pager_put_pages(object, mc, count, flags, pageout_status);
  469 
  470         runlen = count - mreq;
  471         if (eio != NULL)
  472                 *eio = FALSE;
  473         for (i = 0; i < count; i++) {
  474                 vm_page_t mt = mc[i];
  475 
  476                 KASSERT(pageout_status[i] == VM_PAGER_PEND ||
  477                     !pmap_page_is_write_mapped(mt),
  478                     ("vm_pageout_flush: page %p is not write protected", mt));
  479                 switch (pageout_status[i]) {
  480                 case VM_PAGER_OK:
  481                         vm_page_lock(mt);
  482                         if (vm_page_in_laundry(mt))
  483                                 vm_page_deactivate_noreuse(mt);
  484                         vm_page_unlock(mt);
  485                         /* FALLTHROUGH */
  486                 case VM_PAGER_PEND:
  487                         numpagedout++;
  488                         break;
  489                 case VM_PAGER_BAD:
  490                         /*
  491                          * The page is outside the object's range.  We pretend
  492                          * that the page out worked and clean the page, so the
  493                          * changes will be lost if the page is reclaimed by
  494                          * the page daemon.
  495                          */
  496                         vm_page_undirty(mt);
  497                         vm_page_lock(mt);
  498                         if (vm_page_in_laundry(mt))
  499                                 vm_page_deactivate_noreuse(mt);
  500                         vm_page_unlock(mt);
  501                         break;
  502                 case VM_PAGER_ERROR:
  503                 case VM_PAGER_FAIL:
  504                         /*
  505                          * If the page couldn't be paged out to swap because the
  506                          * pager wasn't able to find space, place the page in
  507                          * the PQ_UNSWAPPABLE holding queue.  This is an
  508                          * optimization that prevents the page daemon from
  509                          * wasting CPU cycles on pages that cannot be reclaimed
  510                          * becase no swap device is configured.
  511                          *
  512                          * Otherwise, reactivate the page so that it doesn't
  513                          * clog the laundry and inactive queues.  (We will try
  514                          * paging it out again later.)
  515                          */
  516                         vm_page_lock(mt);
  517                         if (object->type == OBJT_SWAP &&
  518                             pageout_status[i] == VM_PAGER_FAIL) {
  519                                 vm_page_unswappable(mt);
  520                                 numpagedout++;
  521                         } else
  522                                 vm_page_activate(mt);
  523                         vm_page_unlock(mt);
  524                         if (eio != NULL && i >= mreq && i - mreq < runlen)
  525                                 *eio = TRUE;
  526                         break;
  527                 case VM_PAGER_AGAIN:
  528                         if (i >= mreq && i - mreq < runlen)
  529                                 runlen = i - mreq;
  530                         break;
  531                 }
  532 
  533                 /*
  534                  * If the operation is still going, leave the page busy to
  535                  * block all other accesses. Also, leave the paging in
  536                  * progress indicator set so that we don't attempt an object
  537                  * collapse.
  538                  */
  539                 if (pageout_status[i] != VM_PAGER_PEND) {
  540                         vm_object_pip_wakeup(object);
  541                         vm_page_sunbusy(mt);
  542                 }
  543         }
  544         if (prunlen != NULL)
  545                 *prunlen = runlen;
  546         return (numpagedout);
  547 }
  548 
  549 static void
  550 vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused)
  551 {
  552 
  553         atomic_store_rel_int(&swapdev_enabled, 1);
  554 }
  555 
  556 static void
  557 vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused)
  558 {
  559 
  560         if (swap_pager_nswapdev() == 1)
  561                 atomic_store_rel_int(&swapdev_enabled, 0);
  562 }
  563 
  564 /*
  565  * Attempt to acquire all of the necessary locks to launder a page and
  566  * then call through the clustering layer to PUTPAGES.  Wait a short
  567  * time for a vnode lock.
  568  *
  569  * Requires the page and object lock on entry, releases both before return.
  570  * Returns 0 on success and an errno otherwise.
  571  */
  572 static int
  573 vm_pageout_clean(vm_page_t m, int *numpagedout)
  574 {
  575         struct vnode *vp;
  576         struct mount *mp;
  577         vm_object_t object;
  578         vm_pindex_t pindex;
  579         int error, lockmode;
  580 
  581         vm_page_assert_locked(m);
  582         object = m->object;
  583         VM_OBJECT_ASSERT_WLOCKED(object);
  584         error = 0;
  585         vp = NULL;
  586         mp = NULL;
  587 
  588         /*
  589          * The object is already known NOT to be dead.   It
  590          * is possible for the vget() to block the whole
  591          * pageout daemon, but the new low-memory handling
  592          * code should prevent it.
  593          *
  594          * We can't wait forever for the vnode lock, we might
  595          * deadlock due to a vn_read() getting stuck in
  596          * vm_wait while holding this vnode.  We skip the 
  597          * vnode if we can't get it in a reasonable amount
  598          * of time.
  599          */
  600         if (object->type == OBJT_VNODE) {
  601                 vm_page_unlock(m);
  602                 vp = object->handle;
  603                 if (vp->v_type == VREG &&
  604                     vn_start_write(vp, &mp, V_NOWAIT) != 0) {
  605                         mp = NULL;
  606                         error = EDEADLK;
  607                         goto unlock_all;
  608                 }
  609                 KASSERT(mp != NULL,
  610                     ("vp %p with NULL v_mount", vp));
  611                 vm_object_reference_locked(object);
  612                 pindex = m->pindex;
  613                 VM_OBJECT_WUNLOCK(object);
  614                 lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
  615                     LK_SHARED : LK_EXCLUSIVE;
  616                 if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
  617                         vp = NULL;
  618                         error = EDEADLK;
  619                         goto unlock_mp;
  620                 }
  621                 VM_OBJECT_WLOCK(object);
  622 
  623                 /*
  624                  * Ensure that the object and vnode were not disassociated
  625                  * while locks were dropped.
  626                  */
  627                 if (vp->v_object != object) {
  628                         error = ENOENT;
  629                         goto unlock_all;
  630                 }
  631                 vm_page_lock(m);
  632 
  633                 /*
  634                  * While the object and page were unlocked, the page
  635                  * may have been:
  636                  * (1) moved to a different queue,
  637                  * (2) reallocated to a different object,
  638                  * (3) reallocated to a different offset, or
  639                  * (4) cleaned.
  640                  */
  641                 if (!vm_page_in_laundry(m) || m->object != object ||
  642                     m->pindex != pindex || m->dirty == 0) {
  643                         vm_page_unlock(m);
  644                         error = ENXIO;
  645                         goto unlock_all;
  646                 }
  647 
  648                 /*
  649                  * The page may have been busied or referenced while the object
  650                  * and page locks were released.
  651                  */
  652                 if (vm_page_busied(m) || vm_page_held(m)) {
  653                         vm_page_unlock(m);
  654                         error = EBUSY;
  655                         goto unlock_all;
  656                 }
  657         }
  658 
  659         /*
  660          * If a page is dirty, then it is either being washed
  661          * (but not yet cleaned) or it is still in the
  662          * laundry.  If it is still in the laundry, then we
  663          * start the cleaning operation. 
  664          */
  665         if ((*numpagedout = vm_pageout_cluster(m)) == 0)
  666                 error = EIO;
  667 
  668 unlock_all:
  669         VM_OBJECT_WUNLOCK(object);
  670 
  671 unlock_mp:
  672         vm_page_lock_assert(m, MA_NOTOWNED);
  673         if (mp != NULL) {
  674                 if (vp != NULL)
  675                         vput(vp);
  676                 vm_object_deallocate(object);
  677                 vn_finished_write(mp);
  678         }
  679 
  680         return (error);
  681 }
  682 
  683 /*
  684  * Attempt to launder the specified number of pages.
  685  *
  686  * Returns the number of pages successfully laundered.
  687  */
  688 static int
  689 vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
  690 {
  691         struct scan_state ss;
  692         struct vm_pagequeue *pq;
  693         struct mtx *mtx;
  694         vm_object_t object;
  695         vm_page_t m, marker;
  696         int act_delta, error, numpagedout, queue, starting_target;
  697         int vnodes_skipped;
  698         bool obj_locked, pageout_ok;
  699 
  700         mtx = NULL;
  701         obj_locked = false;
  702         object = NULL;
  703         starting_target = launder;
  704         vnodes_skipped = 0;
  705 
  706         /*
  707          * Scan the laundry queues for pages eligible to be laundered.  We stop
  708          * once the target number of dirty pages have been laundered, or once
  709          * we've reached the end of the queue.  A single iteration of this loop
  710          * may cause more than one page to be laundered because of clustering.
  711          *
  712          * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no
  713          * swap devices are configured.
  714          */
  715         if (atomic_load_acq_int(&swapdev_enabled))
  716                 queue = PQ_UNSWAPPABLE;
  717         else
  718                 queue = PQ_LAUNDRY;
  719 
  720 scan:
  721         marker = &vmd->vmd_markers[queue];
  722         pq = &vmd->vmd_pagequeues[queue];
  723         vm_pagequeue_lock(pq);
  724         vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
  725         while (launder > 0 && (m = vm_pageout_next(&ss, false)) != NULL) {
  726                 if (__predict_false((m->flags & PG_MARKER) != 0))
  727                         continue;
  728 
  729                 vm_page_change_lock(m, &mtx);
  730 
  731 recheck:
  732                 /*
  733                  * The page may have been disassociated from the queue
  734                  * while locks were dropped.
  735                  */
  736                 if (vm_page_queue(m) != queue)
  737                         continue;
  738 
  739                 /*
  740                  * A requeue was requested, so this page gets a second
  741                  * chance.
  742                  */
  743                 if ((m->aflags & PGA_REQUEUE) != 0) {
  744                         vm_page_requeue(m);
  745                         continue;
  746                 }
  747 
  748                 /*
  749                  * Held pages are essentially stuck in the queue.
  750                  *
  751                  * Wired pages may not be freed.  Complete their removal
  752                  * from the queue now to avoid needless revisits during
  753                  * future scans.
  754                  */
  755                 if (m->hold_count != 0)
  756                         continue;
  757                 if (m->wire_count != 0) {
  758                         vm_page_dequeue_deferred(m);
  759                         continue;
  760                 }
  761 
  762                 if (object != m->object) {
  763                         if (obj_locked) {
  764                                 VM_OBJECT_WUNLOCK(object);
  765                                 obj_locked = false;
  766                         }
  767                         object = m->object;
  768                 }
  769                 if (!obj_locked) {
  770                         if (!VM_OBJECT_TRYWLOCK(object)) {
  771                                 mtx_unlock(mtx);
  772                                 /* Depends on type-stability. */
  773                                 VM_OBJECT_WLOCK(object);
  774                                 obj_locked = true;
  775                                 mtx_lock(mtx);
  776                                 goto recheck;
  777                         } else
  778                                 obj_locked = true;
  779                 }
  780 
  781                 if (vm_page_busied(m))
  782                         continue;
  783 
  784                 /*
  785                  * Invalid pages can be easily freed.  They cannot be
  786                  * mapped; vm_page_free() asserts this.
  787                  */
  788                 if (m->valid == 0)
  789                         goto free_page;
  790 
  791                 /*
  792                  * If the page has been referenced and the object is not dead,
  793                  * reactivate or requeue the page depending on whether the
  794                  * object is mapped.
  795                  *
  796                  * Test PGA_REFERENCED after calling pmap_ts_referenced() so
  797                  * that a reference from a concurrently destroyed mapping is
  798                  * observed here and now.
  799                  */
  800                 if (object->ref_count != 0)
  801                         act_delta = pmap_ts_referenced(m);
  802                 else {
  803                         KASSERT(!pmap_page_is_mapped(m),
  804                             ("page %p is mapped", m));
  805                         act_delta = 0;
  806                 }
  807                 if ((m->aflags & PGA_REFERENCED) != 0) {
  808                         vm_page_aflag_clear(m, PGA_REFERENCED);
  809                         act_delta++;
  810                 }
  811                 if (act_delta != 0) {
  812                         if (object->ref_count != 0) {
  813                                 VM_CNT_INC(v_reactivated);
  814                                 vm_page_activate(m);
  815 
  816                                 /*
  817                                  * Increase the activation count if the page
  818                                  * was referenced while in the laundry queue.
  819                                  * This makes it less likely that the page will
  820                                  * be returned prematurely to the inactive
  821                                  * queue.
  822                                  */
  823                                 m->act_count += act_delta + ACT_ADVANCE;
  824 
  825                                 /*
  826                                  * If this was a background laundering, count
  827                                  * activated pages towards our target.  The
  828                                  * purpose of background laundering is to ensure
  829                                  * that pages are eventually cycled through the
  830                                  * laundry queue, and an activation is a valid
  831                                  * way out.
  832                                  */
  833                                 if (!in_shortfall)
  834                                         launder--;
  835                                 continue;
  836                         } else if ((object->flags & OBJ_DEAD) == 0) {
  837                                 vm_page_requeue(m);
  838                                 continue;
  839                         }
  840                 }
  841 
  842                 /*
  843                  * If the page appears to be clean at the machine-independent
  844                  * layer, then remove all of its mappings from the pmap in
  845                  * anticipation of freeing it.  If, however, any of the page's
  846                  * mappings allow write access, then the page may still be
  847                  * modified until the last of those mappings are removed.
  848                  */
  849                 if (object->ref_count != 0) {
  850                         vm_page_test_dirty(m);
  851                         if (m->dirty == 0)
  852                                 pmap_remove_all(m);
  853                 }
  854 
  855                 /*
  856                  * Clean pages are freed, and dirty pages are paged out unless
  857                  * they belong to a dead object.  Requeueing dirty pages from
  858                  * dead objects is pointless, as they are being paged out and
  859                  * freed by the thread that destroyed the object.
  860                  */
  861                 if (m->dirty == 0) {
  862 free_page:
  863                         vm_page_free(m);
  864                         VM_CNT_INC(v_dfree);
  865                 } else if ((object->flags & OBJ_DEAD) == 0) {
  866                         if (object->type != OBJT_SWAP &&
  867                             object->type != OBJT_DEFAULT)
  868                                 pageout_ok = true;
  869                         else if (disable_swap_pageouts)
  870                                 pageout_ok = false;
  871                         else
  872                                 pageout_ok = true;
  873                         if (!pageout_ok) {
  874                                 vm_page_requeue(m);
  875                                 continue;
  876                         }
  877 
  878                         /*
  879                          * Form a cluster with adjacent, dirty pages from the
  880                          * same object, and page out that entire cluster.
  881                          *
  882                          * The adjacent, dirty pages must also be in the
  883                          * laundry.  However, their mappings are not checked
  884                          * for new references.  Consequently, a recently
  885                          * referenced page may be paged out.  However, that
  886                          * page will not be prematurely reclaimed.  After page
  887                          * out, the page will be placed in the inactive queue,
  888                          * where any new references will be detected and the
  889                          * page reactivated.
  890                          */
  891                         error = vm_pageout_clean(m, &numpagedout);
  892                         if (error == 0) {
  893                                 launder -= numpagedout;
  894                                 ss.scanned += numpagedout;
  895                         } else if (error == EDEADLK) {
  896                                 pageout_lock_miss++;
  897                                 vnodes_skipped++;
  898                         }
  899                         mtx = NULL;
  900                         obj_locked = false;
  901                 }
  902         }
  903         if (mtx != NULL) {
  904                 mtx_unlock(mtx);
  905                 mtx = NULL;
  906         }
  907         if (obj_locked) {
  908                 VM_OBJECT_WUNLOCK(object);
  909                 obj_locked = false;
  910         }
  911         vm_pagequeue_lock(pq);
  912         vm_pageout_end_scan(&ss);
  913         vm_pagequeue_unlock(pq);
  914 
  915         if (launder > 0 && queue == PQ_UNSWAPPABLE) {
  916                 queue = PQ_LAUNDRY;
  917                 goto scan;
  918         }
  919 
  920         /*
  921          * Wakeup the sync daemon if we skipped a vnode in a writeable object
  922          * and we didn't launder enough pages.
  923          */
  924         if (vnodes_skipped > 0 && launder > 0)
  925                 (void)speedup_syncer();
  926 
  927         return (starting_target - launder);
  928 }
  929 
  930 /*
  931  * Compute the integer square root.
  932  */
  933 static u_int
  934 isqrt(u_int num)
  935 {
  936         u_int bit, root, tmp;
  937 
  938         bit = 1u << ((NBBY * sizeof(u_int)) - 2);
  939         while (bit > num)
  940                 bit >>= 2;
  941         root = 0;
  942         while (bit != 0) {
  943                 tmp = root + bit;
  944                 root >>= 1;
  945                 if (num >= tmp) {
  946                         num -= tmp;
  947                         root += bit;
  948                 }
  949                 bit >>= 2;
  950         }
  951         return (root);
  952 }
  953 
  954 /*
  955  * Perform the work of the laundry thread: periodically wake up and determine
  956  * whether any pages need to be laundered.  If so, determine the number of pages
  957  * that need to be laundered, and launder them.
  958  */
  959 static void
  960 vm_pageout_laundry_worker(void *arg)
  961 {
  962         struct vm_domain *vmd;
  963         struct vm_pagequeue *pq;
  964         uint64_t nclean, ndirty, nfreed;
  965         int domain, last_target, launder, shortfall, shortfall_cycle, target;
  966         bool in_shortfall;
  967 
  968         domain = (uintptr_t)arg;
  969         vmd = VM_DOMAIN(domain);
  970         pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
  971         KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
  972 
  973         shortfall = 0;
  974         in_shortfall = false;
  975         shortfall_cycle = 0;
  976         target = 0;
  977         nfreed = 0;
  978 
  979         /*
  980          * Calls to these handlers are serialized by the swap syscall lock.
  981          */
  982         (void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, vmd,
  983             EVENTHANDLER_PRI_ANY);
  984         (void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, vmd,
  985             EVENTHANDLER_PRI_ANY);
  986 
  987         /*
  988          * The pageout laundry worker is never done, so loop forever.
  989          */
  990         for (;;) {
  991                 KASSERT(target >= 0, ("negative target %d", target));
  992                 KASSERT(shortfall_cycle >= 0,
  993                     ("negative cycle %d", shortfall_cycle));
  994                 launder = 0;
  995 
  996                 /*
  997                  * First determine whether we need to launder pages to meet a
  998                  * shortage of free pages.
  999                  */
 1000                 if (shortfall > 0) {
 1001                         in_shortfall = true;
 1002                         shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
 1003                         target = shortfall;
 1004                 } else if (!in_shortfall)
 1005                         goto trybackground;
 1006                 else if (shortfall_cycle == 0 || vm_laundry_target(vmd) <= 0) {
 1007                         /*
 1008                          * We recently entered shortfall and began laundering
 1009                          * pages.  If we have completed that laundering run
 1010                          * (and we are no longer in shortfall) or we have met
 1011                          * our laundry target through other activity, then we
 1012                          * can stop laundering pages.
 1013                          */
 1014                         in_shortfall = false;
 1015                         target = 0;
 1016                         goto trybackground;
 1017                 }
 1018                 launder = target / shortfall_cycle--;
 1019                 goto dolaundry;
 1020 
 1021                 /*
 1022                  * There's no immediate need to launder any pages; see if we
 1023                  * meet the conditions to perform background laundering:
 1024                  *
 1025                  * 1. The ratio of dirty to clean inactive pages exceeds the
 1026                  *    background laundering threshold, or
 1027                  * 2. we haven't yet reached the target of the current
 1028                  *    background laundering run.
 1029                  *
 1030                  * The background laundering threshold is not a constant.
 1031                  * Instead, it is a slowly growing function of the number of
 1032                  * clean pages freed by the page daemon since the last
 1033                  * background laundering.  Thus, as the ratio of dirty to
 1034                  * clean inactive pages grows, the amount of memory pressure
 1035                  * required to trigger laundering decreases.  We ensure
 1036                  * that the threshold is non-zero after an inactive queue
 1037                  * scan, even if that scan failed to free a single clean page.
 1038                  */
 1039 trybackground:
 1040                 nclean = vmd->vmd_free_count +
 1041                     vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt;
 1042                 ndirty = vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt;
 1043                 if (target == 0 && ndirty * isqrt(howmany(nfreed + 1,
 1044                     vmd->vmd_free_target - vmd->vmd_free_min)) >= nclean) {
 1045                         target = vmd->vmd_background_launder_target;
 1046                 }
 1047 
 1048                 /*
 1049                  * We have a non-zero background laundering target.  If we've
 1050                  * laundered up to our maximum without observing a page daemon
 1051                  * request, just stop.  This is a safety belt that ensures we
 1052                  * don't launder an excessive amount if memory pressure is low
 1053                  * and the ratio of dirty to clean pages is large.  Otherwise,
 1054                  * proceed at the background laundering rate.
 1055                  */
 1056                 if (target > 0) {
 1057                         if (nfreed > 0) {
 1058                                 nfreed = 0;
 1059                                 last_target = target;
 1060                         } else if (last_target - target >=
 1061                             vm_background_launder_max * PAGE_SIZE / 1024) {
 1062                                 target = 0;
 1063                         }
 1064                         launder = vm_background_launder_rate * PAGE_SIZE / 1024;
 1065                         launder /= VM_LAUNDER_RATE;
 1066                         if (launder > target)
 1067                                 launder = target;
 1068                 }
 1069 
 1070 dolaundry:
 1071                 if (launder > 0) {
 1072                         /*
 1073                          * Because of I/O clustering, the number of laundered
 1074                          * pages could exceed "target" by the maximum size of
 1075                          * a cluster minus one. 
 1076                          */
 1077                         target -= min(vm_pageout_launder(vmd, launder,
 1078                             in_shortfall), target);
 1079                         pause("laundp", hz / VM_LAUNDER_RATE);
 1080                 }
 1081 
 1082                 /*
 1083                  * If we're not currently laundering pages and the page daemon
 1084                  * hasn't posted a new request, sleep until the page daemon
 1085                  * kicks us.
 1086                  */
 1087                 vm_pagequeue_lock(pq);
 1088                 if (target == 0 && vmd->vmd_laundry_request == VM_LAUNDRY_IDLE)
 1089                         (void)mtx_sleep(&vmd->vmd_laundry_request,
 1090                             vm_pagequeue_lockptr(pq), PVM, "launds", 0);
 1091 
 1092                 /*
 1093                  * If the pagedaemon has indicated that it's in shortfall, start
 1094                  * a shortfall laundering unless we're already in the middle of
 1095                  * one.  This may preempt a background laundering.
 1096                  */
 1097                 if (vmd->vmd_laundry_request == VM_LAUNDRY_SHORTFALL &&
 1098                     (!in_shortfall || shortfall_cycle == 0)) {
 1099                         shortfall = vm_laundry_target(vmd) +
 1100                             vmd->vmd_pageout_deficit;
 1101                         target = 0;
 1102                 } else
 1103                         shortfall = 0;
 1104 
 1105                 if (target == 0)
 1106                         vmd->vmd_laundry_request = VM_LAUNDRY_IDLE;
 1107                 nfreed += vmd->vmd_clean_pages_freed;
 1108                 vmd->vmd_clean_pages_freed = 0;
 1109                 vm_pagequeue_unlock(pq);
 1110         }
 1111 }
 1112 
 1113 /*
 1114  * Compute the number of pages we want to try to move from the
 1115  * active queue to either the inactive or laundry queue.
 1116  *
 1117  * When scanning active pages during a shortage, we make clean pages
 1118  * count more heavily towards the page shortage than dirty pages.
 1119  * This is because dirty pages must be laundered before they can be
 1120  * reused and thus have less utility when attempting to quickly
 1121  * alleviate a free page shortage.  However, this weighting also
 1122  * causes the scan to deactivate dirty pages more aggressively,
 1123  * improving the effectiveness of clustering.
 1124  */
 1125 static int
 1126 vm_pageout_active_target(struct vm_domain *vmd)
 1127 {
 1128         int shortage;
 1129 
 1130         shortage = vmd->vmd_inactive_target + vm_paging_target(vmd) -
 1131             (vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt +
 1132             vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight);
 1133         shortage *= act_scan_laundry_weight;
 1134         return (shortage);
 1135 }
 1136 
 1137 /*
 1138  * Scan the active queue.  If there is no shortage of inactive pages, scan a
 1139  * small portion of the queue in order to maintain quasi-LRU.
 1140  */
 1141 static void
 1142 vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage)
 1143 {
 1144         struct scan_state ss;
 1145         struct mtx *mtx;
 1146         vm_page_t m, marker;
 1147         struct vm_pagequeue *pq;
 1148         long min_scan;
 1149         int act_delta, max_scan, scan_tick;
 1150 
 1151         marker = &vmd->vmd_markers[PQ_ACTIVE];
 1152         pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
 1153         vm_pagequeue_lock(pq);
 1154 
 1155         /*
 1156          * If we're just idle polling attempt to visit every
 1157          * active page within 'update_period' seconds.
 1158          */
 1159         scan_tick = ticks;
 1160         if (vm_pageout_update_period != 0) {
 1161                 min_scan = pq->pq_cnt;
 1162                 min_scan *= scan_tick - vmd->vmd_last_active_scan;
 1163                 min_scan /= hz * vm_pageout_update_period;
 1164         } else
 1165                 min_scan = 0;
 1166         if (min_scan > 0 || (page_shortage > 0 && pq->pq_cnt > 0))
 1167                 vmd->vmd_last_active_scan = scan_tick;
 1168 
 1169         /*
 1170          * Scan the active queue for pages that can be deactivated.  Update
 1171          * the per-page activity counter and use it to identify deactivation
 1172          * candidates.  Held pages may be deactivated.
 1173          *
 1174          * To avoid requeuing each page that remains in the active queue, we
 1175          * implement the CLOCK algorithm.  To keep the implementation of the
 1176          * enqueue operation consistent for all page queues, we use two hands,
 1177          * represented by marker pages. Scans begin at the first hand, which
 1178          * precedes the second hand in the queue.  When the two hands meet,
 1179          * they are moved back to the head and tail of the queue, respectively,
 1180          * and scanning resumes.
 1181          */
 1182         max_scan = page_shortage > 0 ? pq->pq_cnt : min_scan;
 1183         mtx = NULL;
 1184 act_scan:
 1185         vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan);
 1186         while ((m = vm_pageout_next(&ss, false)) != NULL) {
 1187                 if (__predict_false(m == &vmd->vmd_clock[1])) {
 1188                         vm_pagequeue_lock(pq);
 1189                         TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
 1190                         TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q);
 1191                         TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0],
 1192                             plinks.q);
 1193                         TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1],
 1194                             plinks.q);
 1195                         max_scan -= ss.scanned;
 1196                         vm_pageout_end_scan(&ss);
 1197                         goto act_scan;
 1198                 }
 1199                 if (__predict_false((m->flags & PG_MARKER) != 0))
 1200                         continue;
 1201 
 1202                 vm_page_change_lock(m, &mtx);
 1203 
 1204                 /*
 1205                  * The page may have been disassociated from the queue
 1206                  * while locks were dropped.
 1207                  */
 1208                 if (vm_page_queue(m) != PQ_ACTIVE)
 1209                         continue;
 1210 
 1211                 /*
 1212                  * Wired pages are dequeued lazily.
 1213                  */
 1214                 if (m->wire_count != 0) {
 1215                         vm_page_dequeue_deferred(m);
 1216                         continue;
 1217                 }
 1218 
 1219                 /*
 1220                  * Check to see "how much" the page has been used.
 1221                  *
 1222                  * Test PGA_REFERENCED after calling pmap_ts_referenced() so
 1223                  * that a reference from a concurrently destroyed mapping is
 1224                  * observed here and now.
 1225                  *
 1226                  * Perform an unsynchronized object ref count check.  While
 1227                  * the page lock ensures that the page is not reallocated to
 1228                  * another object, in particular, one with unmanaged mappings
 1229                  * that cannot support pmap_ts_referenced(), two races are,
 1230                  * nonetheless, possible:
 1231                  * 1) The count was transitioning to zero, but we saw a non-
 1232                  *    zero value.  pmap_ts_referenced() will return zero
 1233                  *    because the page is not mapped.
 1234                  * 2) The count was transitioning to one, but we saw zero.
 1235                  *    This race delays the detection of a new reference.  At
 1236                  *    worst, we will deactivate and reactivate the page.
 1237                  */
 1238                 if (m->object->ref_count != 0)
 1239                         act_delta = pmap_ts_referenced(m);
 1240                 else
 1241                         act_delta = 0;
 1242                 if ((m->aflags & PGA_REFERENCED) != 0) {
 1243                         vm_page_aflag_clear(m, PGA_REFERENCED);
 1244                         act_delta++;
 1245                 }
 1246 
 1247                 /*
 1248                  * Advance or decay the act_count based on recent usage.
 1249                  */
 1250                 if (act_delta != 0) {
 1251                         m->act_count += ACT_ADVANCE + act_delta;
 1252                         if (m->act_count > ACT_MAX)
 1253                                 m->act_count = ACT_MAX;
 1254                 } else
 1255                         m->act_count -= min(m->act_count, ACT_DECLINE);
 1256 
 1257                 if (m->act_count == 0) {
 1258                         /*
 1259                          * When not short for inactive pages, let dirty pages go
 1260                          * through the inactive queue before moving to the
 1261                          * laundry queues.  This gives them some extra time to
 1262                          * be reactivated, potentially avoiding an expensive
 1263                          * pageout.  However, during a page shortage, the
 1264                          * inactive queue is necessarily small, and so dirty
 1265                          * pages would only spend a trivial amount of time in
 1266                          * the inactive queue.  Therefore, we might as well
 1267                          * place them directly in the laundry queue to reduce
 1268                          * queuing overhead.
 1269                          */
 1270                         if (page_shortage <= 0)
 1271                                 vm_page_deactivate(m);
 1272                         else {
 1273                                 /*
 1274                                  * Calling vm_page_test_dirty() here would
 1275                                  * require acquisition of the object's write
 1276                                  * lock.  However, during a page shortage,
 1277                                  * directing dirty pages into the laundry
 1278                                  * queue is only an optimization and not a
 1279                                  * requirement.  Therefore, we simply rely on
 1280                                  * the opportunistic updates to the page's
 1281                                  * dirty field by the pmap.
 1282                                  */
 1283                                 if (m->dirty == 0) {
 1284                                         vm_page_deactivate(m);
 1285                                         page_shortage -=
 1286                                             act_scan_laundry_weight;
 1287                                 } else {
 1288                                         vm_page_launder(m);
 1289                                         page_shortage--;
 1290                                 }
 1291                         }
 1292                 }
 1293         }
 1294         if (mtx != NULL) {
 1295                 mtx_unlock(mtx);
 1296                 mtx = NULL;
 1297         }
 1298         vm_pagequeue_lock(pq);
 1299         TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
 1300         TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q);
 1301         vm_pageout_end_scan(&ss);
 1302         vm_pagequeue_unlock(pq);
 1303 }
 1304 
 1305 static int
 1306 vm_pageout_reinsert_inactive_page(struct scan_state *ss, vm_page_t m)
 1307 {
 1308         struct vm_domain *vmd;
 1309 
 1310         if (m->queue != PQ_INACTIVE || (m->aflags & PGA_ENQUEUED) != 0)
 1311                 return (0);
 1312         vm_page_aflag_set(m, PGA_ENQUEUED);
 1313         if ((m->aflags & PGA_REQUEUE_HEAD) != 0) {
 1314                 vmd = vm_pagequeue_domain(m);
 1315                 TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q);
 1316                 vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD);
 1317         } else if ((m->aflags & PGA_REQUEUE) != 0) {
 1318                 TAILQ_INSERT_TAIL(&ss->pq->pq_pl, m, plinks.q);
 1319                 vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD);
 1320         } else
 1321                 TAILQ_INSERT_BEFORE(ss->marker, m, plinks.q);
 1322         return (1);
 1323 }
 1324 
 1325 /*
 1326  * Re-add stuck pages to the inactive queue.  We will examine them again
 1327  * during the next scan.  If the queue state of a page has changed since
 1328  * it was physically removed from the page queue in
 1329  * vm_pageout_collect_batch(), don't do anything with that page.
 1330  */
 1331 static void
 1332 vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq,
 1333     vm_page_t m)
 1334 {
 1335         struct vm_pagequeue *pq;
 1336         int delta;
 1337 
 1338         delta = 0;
 1339         pq = ss->pq;
 1340 
 1341         if (m != NULL) {
 1342                 if (vm_batchqueue_insert(bq, m))
 1343                         return;
 1344                 vm_pagequeue_lock(pq);
 1345                 delta += vm_pageout_reinsert_inactive_page(ss, m);
 1346         } else
 1347                 vm_pagequeue_lock(pq);
 1348         while ((m = vm_batchqueue_pop(bq)) != NULL)
 1349                 delta += vm_pageout_reinsert_inactive_page(ss, m);
 1350         vm_pagequeue_cnt_add(pq, delta);
 1351         vm_pagequeue_unlock(pq);
 1352         vm_batchqueue_init(bq);
 1353 }
 1354 
 1355 /*
 1356  * Attempt to reclaim the requested number of pages from the inactive queue.
 1357  * Returns true if the shortage was addressed.
 1358  */
 1359 static int
 1360 vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage,
 1361     int *addl_shortage)
 1362 {
 1363         struct scan_state ss;
 1364         struct vm_batchqueue rq;
 1365         struct mtx *mtx;
 1366         vm_page_t m, marker;
 1367         struct vm_pagequeue *pq;
 1368         vm_object_t object;
 1369         int act_delta, addl_page_shortage, deficit, page_shortage;
 1370         int starting_page_shortage;
 1371         bool obj_locked;
 1372 
 1373         /*
 1374          * The addl_page_shortage is an estimate of the number of temporarily
 1375          * stuck pages in the inactive queue.  In other words, the
 1376          * number of pages from the inactive count that should be
 1377          * discounted in setting the target for the active queue scan.
 1378          */
 1379         addl_page_shortage = 0;
 1380 
 1381         /*
 1382          * vmd_pageout_deficit counts the number of pages requested in
 1383          * allocations that failed because of a free page shortage.  We assume
 1384          * that the allocations will be reattempted and thus include the deficit
 1385          * in our scan target.
 1386          */
 1387         deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit);
 1388         starting_page_shortage = page_shortage = shortage + deficit;
 1389 
 1390         mtx = NULL;
 1391         obj_locked = false;
 1392         object = NULL;
 1393         vm_batchqueue_init(&rq);
 1394 
 1395         /*
 1396          * Start scanning the inactive queue for pages that we can free.  The
 1397          * scan will stop when we reach the target or we have scanned the
 1398          * entire queue.  (Note that m->act_count is not used to make
 1399          * decisions for the inactive queue, only for the active queue.)
 1400          */
 1401         marker = &vmd->vmd_markers[PQ_INACTIVE];
 1402         pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
 1403         vm_pagequeue_lock(pq);
 1404         vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
 1405         while (page_shortage > 0 && (m = vm_pageout_next(&ss, true)) != NULL) {
 1406                 KASSERT((m->flags & PG_MARKER) == 0,
 1407                     ("marker page %p was dequeued", m));
 1408 
 1409                 vm_page_change_lock(m, &mtx);
 1410 
 1411 recheck:
 1412                 /*
 1413                  * The page may have been disassociated from the queue
 1414                  * while locks were dropped.
 1415                  */
 1416                 if (vm_page_queue(m) != PQ_INACTIVE) {
 1417                         addl_page_shortage++;
 1418                         continue;
 1419                 }
 1420 
 1421                 /*
 1422                  * The page was re-enqueued after the page queue lock was
 1423                  * dropped, or a requeue was requested.  This page gets a second
 1424                  * chance.
 1425                  */
 1426                 if ((m->aflags & (PGA_ENQUEUED | PGA_REQUEUE |
 1427                     PGA_REQUEUE_HEAD)) != 0)
 1428                         goto reinsert;
 1429 
 1430                 /*
 1431                  * Held pages are essentially stuck in the queue.  So,
 1432                  * they ought to be discounted from the inactive count.
 1433                  * See the description of addl_page_shortage above.
 1434                  *
 1435                  * Wired pages may not be freed.  Complete their removal
 1436                  * from the queue now to avoid needless revisits during
 1437                  * future scans.
 1438                  */
 1439                 if (m->hold_count != 0) {
 1440                         addl_page_shortage++;
 1441                         goto reinsert;
 1442                 }
 1443                 if (m->wire_count != 0) {
 1444                         vm_page_dequeue_deferred(m);
 1445                         continue;
 1446                 }
 1447 
 1448                 if (object != m->object) {
 1449                         if (obj_locked) {
 1450                                 VM_OBJECT_WUNLOCK(object);
 1451                                 obj_locked = false;
 1452                         }
 1453                         object = m->object;
 1454                 }
 1455                 if (!obj_locked) {
 1456                         if (!VM_OBJECT_TRYWLOCK(object)) {
 1457                                 mtx_unlock(mtx);
 1458                                 /* Depends on type-stability. */
 1459                                 VM_OBJECT_WLOCK(object);
 1460                                 obj_locked = true;
 1461                                 mtx_lock(mtx);
 1462                                 goto recheck;
 1463                         } else
 1464                                 obj_locked = true;
 1465                 }
 1466 
 1467                 if (vm_page_busied(m)) {
 1468                         /*
 1469                          * Don't mess with busy pages.  Leave them at
 1470                          * the front of the queue.  Most likely, they
 1471                          * are being paged out and will leave the
 1472                          * queue shortly after the scan finishes.  So,
 1473                          * they ought to be discounted from the
 1474                          * inactive count.
 1475                          */
 1476                         addl_page_shortage++;
 1477                         goto reinsert;
 1478                 }
 1479 
 1480                 /*
 1481                  * Invalid pages can be easily freed. They cannot be
 1482                  * mapped, vm_page_free() asserts this.
 1483                  */
 1484                 if (m->valid == 0)
 1485                         goto free_page;
 1486 
 1487                 /*
 1488                  * If the page has been referenced and the object is not dead,
 1489                  * reactivate or requeue the page depending on whether the
 1490                  * object is mapped.
 1491                  *
 1492                  * Test PGA_REFERENCED after calling pmap_ts_referenced() so
 1493                  * that a reference from a concurrently destroyed mapping is
 1494                  * observed here and now.
 1495                  */
 1496                 if (object->ref_count != 0)
 1497                         act_delta = pmap_ts_referenced(m);
 1498                 else {
 1499                         KASSERT(!pmap_page_is_mapped(m),
 1500                             ("page %p is mapped", m));
 1501                         act_delta = 0;
 1502                 }
 1503                 if ((m->aflags & PGA_REFERENCED) != 0) {
 1504                         vm_page_aflag_clear(m, PGA_REFERENCED);
 1505                         act_delta++;
 1506                 }
 1507                 if (act_delta != 0) {
 1508                         if (object->ref_count != 0) {
 1509                                 VM_CNT_INC(v_reactivated);
 1510                                 vm_page_activate(m);
 1511 
 1512                                 /*
 1513                                  * Increase the activation count if the page
 1514                                  * was referenced while in the inactive queue.
 1515                                  * This makes it less likely that the page will
 1516                                  * be returned prematurely to the inactive
 1517                                  * queue.
 1518                                  */
 1519                                 m->act_count += act_delta + ACT_ADVANCE;
 1520                                 continue;
 1521                         } else if ((object->flags & OBJ_DEAD) == 0) {
 1522                                 vm_page_aflag_set(m, PGA_REQUEUE);
 1523                                 goto reinsert;
 1524                         }
 1525                 }
 1526 
 1527                 /*
 1528                  * If the page appears to be clean at the machine-independent
 1529                  * layer, then remove all of its mappings from the pmap in
 1530                  * anticipation of freeing it.  If, however, any of the page's
 1531                  * mappings allow write access, then the page may still be
 1532                  * modified until the last of those mappings are removed.
 1533                  */
 1534                 if (object->ref_count != 0) {
 1535                         vm_page_test_dirty(m);
 1536                         if (m->dirty == 0)
 1537                                 pmap_remove_all(m);
 1538                 }
 1539 
 1540                 /*
 1541                  * Clean pages can be freed, but dirty pages must be sent back
 1542                  * to the laundry, unless they belong to a dead object.
 1543                  * Requeueing dirty pages from dead objects is pointless, as
 1544                  * they are being paged out and freed by the thread that
 1545                  * destroyed the object.
 1546                  */
 1547                 if (m->dirty == 0) {
 1548 free_page:
 1549                         /*
 1550                          * Because we dequeued the page and have already
 1551                          * checked for concurrent dequeue and enqueue
 1552                          * requests, we can safely disassociate the page
 1553                          * from the inactive queue.
 1554                          */
 1555                         KASSERT((m->aflags & PGA_QUEUE_STATE_MASK) == 0,
 1556                             ("page %p has queue state", m));
 1557                         m->queue = PQ_NONE;
 1558                         vm_page_free(m);
 1559                         page_shortage--;
 1560                 } else if ((object->flags & OBJ_DEAD) == 0)
 1561                         vm_page_launder(m);
 1562                 continue;
 1563 reinsert:
 1564                 vm_pageout_reinsert_inactive(&ss, &rq, m);
 1565         }
 1566         if (mtx != NULL) {
 1567                 mtx_unlock(mtx);
 1568                 mtx = NULL;
 1569         }
 1570         if (obj_locked) {
 1571                 VM_OBJECT_WUNLOCK(object);
 1572                 obj_locked = false;
 1573         }
 1574         vm_pageout_reinsert_inactive(&ss, &rq, NULL);
 1575         vm_pageout_reinsert_inactive(&ss, &ss.bq, NULL);
 1576         vm_pagequeue_lock(pq);
 1577         vm_pageout_end_scan(&ss);
 1578         vm_pagequeue_unlock(pq);
 1579 
 1580         VM_CNT_ADD(v_dfree, starting_page_shortage - page_shortage);
 1581 
 1582         /*
 1583          * Wake up the laundry thread so that it can perform any needed
 1584          * laundering.  If we didn't meet our target, we're in shortfall and
 1585          * need to launder more aggressively.  If PQ_LAUNDRY is empty and no
 1586          * swap devices are configured, the laundry thread has no work to do, so
 1587          * don't bother waking it up.
 1588          *
 1589          * The laundry thread uses the number of inactive queue scans elapsed
 1590          * since the last laundering to determine whether to launder again, so
 1591          * keep count.
 1592          */
 1593         if (starting_page_shortage > 0) {
 1594                 pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
 1595                 vm_pagequeue_lock(pq);
 1596                 if (vmd->vmd_laundry_request == VM_LAUNDRY_IDLE &&
 1597                     (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) {
 1598                         if (page_shortage > 0) {
 1599                                 vmd->vmd_laundry_request = VM_LAUNDRY_SHORTFALL;
 1600                                 VM_CNT_INC(v_pdshortfalls);
 1601                         } else if (vmd->vmd_laundry_request !=
 1602                             VM_LAUNDRY_SHORTFALL)
 1603                                 vmd->vmd_laundry_request =
 1604                                     VM_LAUNDRY_BACKGROUND;
 1605                         wakeup(&vmd->vmd_laundry_request);
 1606                 }
 1607                 vmd->vmd_clean_pages_freed +=
 1608                     starting_page_shortage - page_shortage;
 1609                 vm_pagequeue_unlock(pq);
 1610         }
 1611 
 1612         /*
 1613          * Wakeup the swapout daemon if we didn't free the targeted number of
 1614          * pages.
 1615          */
 1616         if (page_shortage > 0)
 1617                 vm_swapout_run();
 1618 
 1619         /*
 1620          * If the inactive queue scan fails repeatedly to meet its
 1621          * target, kill the largest process.
 1622          */
 1623         vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);
 1624 
 1625         /*
 1626          * Reclaim pages by swapping out idle processes, if configured to do so.
 1627          */
 1628         vm_swapout_run_idle();
 1629 
 1630         /*
 1631          * See the description of addl_page_shortage above.
 1632          */
 1633         *addl_shortage = addl_page_shortage + deficit;
 1634 
 1635         return (page_shortage <= 0);
 1636 }
 1637 
 1638 static int vm_pageout_oom_vote;
 1639 
 1640 /*
 1641  * The pagedaemon threads randlomly select one to perform the
 1642  * OOM.  Trying to kill processes before all pagedaemons
 1643  * failed to reach free target is premature.
 1644  */
 1645 static void
 1646 vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
 1647     int starting_page_shortage)
 1648 {
 1649         int old_vote;
 1650 
 1651         if (starting_page_shortage <= 0 || starting_page_shortage !=
 1652             page_shortage)
 1653                 vmd->vmd_oom_seq = 0;
 1654         else
 1655                 vmd->vmd_oom_seq++;
 1656         if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
 1657                 if (vmd->vmd_oom) {
 1658                         vmd->vmd_oom = FALSE;
 1659                         atomic_subtract_int(&vm_pageout_oom_vote, 1);
 1660                 }
 1661                 return;
 1662         }
 1663 
 1664         /*
 1665          * Do not follow the call sequence until OOM condition is
 1666          * cleared.
 1667          */
 1668         vmd->vmd_oom_seq = 0;
 1669 
 1670         if (vmd->vmd_oom)
 1671                 return;
 1672 
 1673         vmd->vmd_oom = TRUE;
 1674         old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
 1675         if (old_vote != vm_ndomains - 1)
 1676                 return;
 1677 
 1678         /*
 1679          * The current pagedaemon thread is the last in the quorum to
 1680          * start OOM.  Initiate the selection and signaling of the
 1681          * victim.
 1682          */
 1683         vm_pageout_oom(VM_OOM_MEM);
 1684 
 1685         /*
 1686          * After one round of OOM terror, recall our vote.  On the
 1687          * next pass, current pagedaemon would vote again if the low
 1688          * memory condition is still there, due to vmd_oom being
 1689          * false.
 1690          */
 1691         vmd->vmd_oom = FALSE;
 1692         atomic_subtract_int(&vm_pageout_oom_vote, 1);
 1693 }
 1694 
 1695 /*
 1696  * The OOM killer is the page daemon's action of last resort when
 1697  * memory allocation requests have been stalled for a prolonged period
 1698  * of time because it cannot reclaim memory.  This function computes
 1699  * the approximate number of physical pages that could be reclaimed if
 1700  * the specified address space is destroyed.
 1701  *
 1702  * Private, anonymous memory owned by the address space is the
 1703  * principal resource that we expect to recover after an OOM kill.
 1704  * Since the physical pages mapped by the address space's COW entries
 1705  * are typically shared pages, they are unlikely to be released and so
 1706  * they are not counted.
 1707  *
 1708  * To get to the point where the page daemon runs the OOM killer, its
 1709  * efforts to write-back vnode-backed pages may have stalled.  This
 1710  * could be caused by a memory allocation deadlock in the write path
 1711  * that might be resolved by an OOM kill.  Therefore, physical pages
 1712  * belonging to vnode-backed objects are counted, because they might
 1713  * be freed without being written out first if the address space holds
 1714  * the last reference to an unlinked vnode.
 1715  *
 1716  * Similarly, physical pages belonging to OBJT_PHYS objects are
 1717  * counted because the address space might hold the last reference to
 1718  * the object.
 1719  */
 1720 static long
 1721 vm_pageout_oom_pagecount(struct vmspace *vmspace)
 1722 {
 1723         vm_map_t map;
 1724         vm_map_entry_t entry;
 1725         vm_object_t obj;
 1726         long res;
 1727 
 1728         map = &vmspace->vm_map;
 1729         KASSERT(!map->system_map, ("system map"));
 1730         sx_assert(&map->lock, SA_LOCKED);
 1731         res = 0;
 1732         for (entry = map->header.next; entry != &map->header;
 1733             entry = entry->next) {
 1734                 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 1735                         continue;
 1736                 obj = entry->object.vm_object;
 1737                 if (obj == NULL)
 1738                         continue;
 1739                 if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 &&
 1740                     obj->ref_count != 1)
 1741                         continue;
 1742                 switch (obj->type) {
 1743                 case OBJT_DEFAULT:
 1744                 case OBJT_SWAP:
 1745                 case OBJT_PHYS:
 1746                 case OBJT_VNODE:
 1747                         res += obj->resident_page_count;
 1748                         break;
 1749                 }
 1750         }
 1751         return (res);
 1752 }
 1753 
 1754 void
 1755 vm_pageout_oom(int shortage)
 1756 {
 1757         struct proc *p, *bigproc;
 1758         vm_offset_t size, bigsize;
 1759         struct thread *td;
 1760         struct vmspace *vm;
 1761         bool breakout;
 1762 
 1763         /*
 1764          * We keep the process bigproc locked once we find it to keep anyone
 1765          * from messing with it; however, there is a possibility of
 1766          * deadlock if process B is bigproc and one of its child processes
 1767          * attempts to propagate a signal to B while we are waiting for A's
 1768          * lock while walking this list.  To avoid this, we don't block on
 1769          * the process lock but just skip a process if it is already locked.
 1770          */
 1771         bigproc = NULL;
 1772         bigsize = 0;
 1773         sx_slock(&allproc_lock);
 1774         FOREACH_PROC_IN_SYSTEM(p) {
 1775                 PROC_LOCK(p);
 1776 
 1777                 /*
 1778                  * If this is a system, protected or killed process, skip it.
 1779                  */
 1780                 if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
 1781                     P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
 1782                     p->p_pid == 1 || P_KILLED(p) ||
 1783                     (p->p_pid < 48 && swap_pager_avail != 0)) {
 1784                         PROC_UNLOCK(p);
 1785                         continue;
 1786                 }
 1787                 /*
 1788                  * If the process is in a non-running type state,
 1789                  * don't touch it.  Check all the threads individually.
 1790                  */
 1791                 breakout = false;
 1792                 FOREACH_THREAD_IN_PROC(p, td) {
 1793                         thread_lock(td);
 1794                         if (!TD_ON_RUNQ(td) &&
 1795                             !TD_IS_RUNNING(td) &&
 1796                             !TD_IS_SLEEPING(td) &&
 1797                             !TD_IS_SUSPENDED(td) &&
 1798                             !TD_IS_SWAPPED(td)) {
 1799                                 thread_unlock(td);
 1800                                 breakout = true;
 1801                                 break;
 1802                         }
 1803                         thread_unlock(td);
 1804                 }
 1805                 if (breakout) {
 1806                         PROC_UNLOCK(p);
 1807                         continue;
 1808                 }
 1809                 /*
 1810                  * get the process size
 1811                  */
 1812                 vm = vmspace_acquire_ref(p);
 1813                 if (vm == NULL) {
 1814                         PROC_UNLOCK(p);
 1815                         continue;
 1816                 }
 1817                 _PHOLD_LITE(p);
 1818                 PROC_UNLOCK(p);
 1819                 sx_sunlock(&allproc_lock);
 1820                 if (!vm_map_trylock_read(&vm->vm_map)) {
 1821                         vmspace_free(vm);
 1822                         sx_slock(&allproc_lock);
 1823                         PRELE(p);
 1824                         continue;
 1825                 }
 1826                 size = vmspace_swap_count(vm);
 1827                 if (shortage == VM_OOM_MEM)
 1828                         size += vm_pageout_oom_pagecount(vm);
 1829                 vm_map_unlock_read(&vm->vm_map);
 1830                 vmspace_free(vm);
 1831                 sx_slock(&allproc_lock);
 1832 
 1833                 /*
 1834                  * If this process is bigger than the biggest one,
 1835                  * remember it.
 1836                  */
 1837                 if (size > bigsize) {
 1838                         if (bigproc != NULL)
 1839                                 PRELE(bigproc);
 1840                         bigproc = p;
 1841                         bigsize = size;
 1842                 } else {
 1843                         PRELE(p);
 1844                 }
 1845         }
 1846         sx_sunlock(&allproc_lock);
 1847         if (bigproc != NULL) {
 1848                 if (vm_panic_on_oom != 0)
 1849                         panic("out of swap space");
 1850                 PROC_LOCK(bigproc);
 1851                 killproc(bigproc, "out of swap space");
 1852                 sched_nice(bigproc, PRIO_MIN);
 1853                 _PRELE(bigproc);
 1854                 PROC_UNLOCK(bigproc);
 1855         }
 1856 }
 1857 
 1858 static bool
 1859 vm_pageout_lowmem(void)
 1860 {
 1861         static int lowmem_ticks = 0;
 1862         int last;
 1863 
 1864         last = atomic_load_int(&lowmem_ticks);
 1865         while ((u_int)(ticks - last) / hz >= lowmem_period) {
 1866                 if (atomic_fcmpset_int(&lowmem_ticks, &last, ticks) == 0)
 1867                         continue;
 1868 
 1869                 /*
 1870                  * Decrease registered cache sizes.
 1871                  */
 1872                 SDT_PROBE0(vm, , , vm__lowmem_scan);
 1873                 EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES);
 1874 
 1875                 /*
 1876                  * We do this explicitly after the caches have been
 1877                  * drained above.
 1878                  */
 1879                 uma_reclaim();
 1880                 return (true);
 1881         }
 1882         return (false);
 1883 }
 1884 
 1885 static void
 1886 vm_pageout_worker(void *arg)
 1887 {
 1888         struct vm_domain *vmd;
 1889         u_int ofree;
 1890         int addl_shortage, domain, shortage;
 1891         bool target_met;
 1892 
 1893         domain = (uintptr_t)arg;
 1894         vmd = VM_DOMAIN(domain);
 1895         shortage = 0;
 1896         target_met = true;
 1897 
 1898         /*
 1899          * XXXKIB It could be useful to bind pageout daemon threads to
 1900          * the cores belonging to the domain, from which vm_page_array
 1901          * is allocated.
 1902          */
 1903 
 1904         KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
 1905         vmd->vmd_last_active_scan = ticks;
 1906 
 1907         /*
 1908          * The pageout daemon worker is never done, so loop forever.
 1909          */
 1910         while (TRUE) {
 1911                 vm_domain_pageout_lock(vmd);
 1912 
 1913                 /*
 1914                  * We need to clear wanted before we check the limits.  This
 1915                  * prevents races with wakers who will check wanted after they
 1916                  * reach the limit.
 1917                  */
 1918                 atomic_store_int(&vmd->vmd_pageout_wanted, 0);
 1919 
 1920                 /*
 1921                  * Might the page daemon need to run again?
 1922                  */
 1923                 if (vm_paging_needed(vmd, vmd->vmd_free_count)) {
 1924                         /*
 1925                          * Yes.  If the scan failed to produce enough free
 1926                          * pages, sleep uninterruptibly for some time in the
 1927                          * hope that the laundry thread will clean some pages.
 1928                          */
 1929                         vm_domain_pageout_unlock(vmd);
 1930                         if (!target_met)
 1931                                 pause("pwait", hz / VM_INACT_SCAN_RATE);
 1932                 } else {
 1933                         /*
 1934                          * No, sleep until the next wakeup or until pages
 1935                          * need to have their reference stats updated.
 1936                          */
 1937                         if (mtx_sleep(&vmd->vmd_pageout_wanted,
 1938                             vm_domain_pageout_lockptr(vmd), PDROP | PVM,
 1939                             "psleep", hz / VM_INACT_SCAN_RATE) == 0)
 1940                                 VM_CNT_INC(v_pdwakeups);
 1941                 }
 1942 
 1943                 /* Prevent spurious wakeups by ensuring that wanted is set. */
 1944                 atomic_store_int(&vmd->vmd_pageout_wanted, 1);
 1945 
 1946                 /*
 1947                  * Use the controller to calculate how many pages to free in
 1948                  * this interval, and scan the inactive queue.  If the lowmem
 1949                  * handlers appear to have freed up some pages, subtract the
 1950                  * difference from the inactive queue scan target.
 1951                  */
 1952                 shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count);
 1953                 if (shortage > 0) {
 1954                         ofree = vmd->vmd_free_count;
 1955                         if (vm_pageout_lowmem() && vmd->vmd_free_count > ofree)
 1956                                 shortage -= min(vmd->vmd_free_count - ofree,
 1957                                     (u_int)shortage);
 1958                         target_met = vm_pageout_scan_inactive(vmd, shortage,
 1959                             &addl_shortage);
 1960                 } else
 1961                         addl_shortage = 0;
 1962 
 1963                 /*
 1964                  * Scan the active queue.  A positive value for shortage
 1965                  * indicates that we must aggressively deactivate pages to avoid
 1966                  * a shortfall.
 1967                  */
 1968                 shortage = vm_pageout_active_target(vmd) + addl_shortage;
 1969                 vm_pageout_scan_active(vmd, shortage);
 1970         }
 1971 }
 1972 
 1973 /*
 1974  *      vm_pageout_init initialises basic pageout daemon settings.
 1975  */
 1976 static void
 1977 vm_pageout_init_domain(int domain)
 1978 {
 1979         struct vm_domain *vmd;
 1980         struct sysctl_oid *oid;
 1981 
 1982         vmd = VM_DOMAIN(domain);
 1983         vmd->vmd_interrupt_free_min = 2;
 1984 
 1985         /*
 1986          * v_free_reserved needs to include enough for the largest
 1987          * swap pager structures plus enough for any pv_entry structs
 1988          * when paging. 
 1989          */
 1990         if (vmd->vmd_page_count > 1024)
 1991                 vmd->vmd_free_min = 4 + (vmd->vmd_page_count - 1024) / 200;
 1992         else
 1993                 vmd->vmd_free_min = 4;
 1994         vmd->vmd_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
 1995             vmd->vmd_interrupt_free_min;
 1996         vmd->vmd_free_reserved = vm_pageout_page_count +
 1997             vmd->vmd_pageout_free_min + (vmd->vmd_page_count / 768);
 1998         vmd->vmd_free_severe = vmd->vmd_free_min / 2;
 1999         vmd->vmd_free_target = 4 * vmd->vmd_free_min + vmd->vmd_free_reserved;
 2000         vmd->vmd_free_min += vmd->vmd_free_reserved;
 2001         vmd->vmd_free_severe += vmd->vmd_free_reserved;
 2002         vmd->vmd_inactive_target = (3 * vmd->vmd_free_target) / 2;
 2003         if (vmd->vmd_inactive_target > vmd->vmd_free_count / 3)
 2004                 vmd->vmd_inactive_target = vmd->vmd_free_count / 3;
 2005 
 2006         /*
 2007          * Set the default wakeup threshold to be 10% below the paging
 2008          * target.  This keeps the steady state out of shortfall.
 2009          */
 2010         vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_target / 10) * 9;
 2011 
 2012         /*
 2013          * Target amount of memory to move out of the laundry queue during a
 2014          * background laundering.  This is proportional to the amount of system
 2015          * memory.
 2016          */
 2017         vmd->vmd_background_launder_target = (vmd->vmd_free_target -
 2018             vmd->vmd_free_min) / 10;
 2019 
 2020         /* Initialize the pageout daemon pid controller. */
 2021         pidctrl_init(&vmd->vmd_pid, hz / VM_INACT_SCAN_RATE,
 2022             vmd->vmd_free_target, PIDCTRL_BOUND,
 2023             PIDCTRL_KPD, PIDCTRL_KID, PIDCTRL_KDD);
 2024         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
 2025             "pidctrl", CTLFLAG_RD, NULL, "");
 2026         pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid));
 2027 }
 2028 
 2029 static void
 2030 vm_pageout_init(void)
 2031 {
 2032         u_int freecount;
 2033         int i;
 2034 
 2035         /*
 2036          * Initialize some paging parameters.
 2037          */
 2038         if (vm_cnt.v_page_count < 2000)
 2039                 vm_pageout_page_count = 8;
 2040 
 2041         freecount = 0;
 2042         for (i = 0; i < vm_ndomains; i++) {
 2043                 struct vm_domain *vmd;
 2044 
 2045                 vm_pageout_init_domain(i);
 2046                 vmd = VM_DOMAIN(i);
 2047                 vm_cnt.v_free_reserved += vmd->vmd_free_reserved;
 2048                 vm_cnt.v_free_target += vmd->vmd_free_target;
 2049                 vm_cnt.v_free_min += vmd->vmd_free_min;
 2050                 vm_cnt.v_inactive_target += vmd->vmd_inactive_target;
 2051                 vm_cnt.v_pageout_free_min += vmd->vmd_pageout_free_min;
 2052                 vm_cnt.v_interrupt_free_min += vmd->vmd_interrupt_free_min;
 2053                 vm_cnt.v_free_severe += vmd->vmd_free_severe;
 2054                 freecount += vmd->vmd_free_count;
 2055         }
 2056 
 2057         /*
 2058          * Set interval in seconds for active scan.  We want to visit each
 2059          * page at least once every ten minutes.  This is to prevent worst
 2060          * case paging behaviors with stale active LRU.
 2061          */
 2062         if (vm_pageout_update_period == 0)
 2063                 vm_pageout_update_period = 600;
 2064 
 2065         if (vm_page_max_wired == 0)
 2066                 vm_page_max_wired = freecount / 3;
 2067 }
 2068 
 2069 /*
 2070  *     vm_pageout is the high level pageout daemon.
 2071  */
 2072 static void
 2073 vm_pageout(void)
 2074 {
 2075         struct proc *p;
 2076         struct thread *td;
 2077         int error, first, i;
 2078 
 2079         p = curproc;
 2080         td = curthread;
 2081 
 2082         swap_pager_swap_init();
 2083         for (first = -1, i = 0; i < vm_ndomains; i++) {
 2084                 if (VM_DOMAIN_EMPTY(i)) {
 2085                         if (bootverbose)
 2086                                 printf("domain %d empty; skipping pageout\n",
 2087                                     i);
 2088                         continue;
 2089                 }
 2090                 if (first == -1)
 2091                         first = i;
 2092                 else {
 2093                         error = kthread_add(vm_pageout_worker,
 2094                             (void *)(uintptr_t)i, p, NULL, 0, 0, "dom%d", i);
 2095                         if (error != 0)
 2096                                 panic("starting pageout for domain %d: %d\n",
 2097                                     i, error);
 2098                 }
 2099                 error = kthread_add(vm_pageout_laundry_worker,
 2100                     (void *)(uintptr_t)i, p, NULL, 0, 0, "laundry: dom%d", i);
 2101                 if (error != 0)
 2102                         panic("starting laundry for domain %d: %d", i, error);
 2103         }
 2104         error = kthread_add(uma_reclaim_worker, NULL, p, NULL, 0, 0, "uma");
 2105         if (error != 0)
 2106                 panic("starting uma_reclaim helper, error %d\n", error);
 2107 
 2108         snprintf(td->td_name, sizeof(td->td_name), "dom%d", first);
 2109         vm_pageout_worker((void *)(uintptr_t)first);
 2110 }
 2111 
 2112 /*
 2113  * Perform an advisory wakeup of the page daemon.
 2114  */
 2115 void
 2116 pagedaemon_wakeup(int domain)
 2117 {
 2118         struct vm_domain *vmd;
 2119 
 2120         vmd = VM_DOMAIN(domain);
 2121         vm_domain_pageout_assert_unlocked(vmd);
 2122         if (curproc == pageproc)
 2123                 return;
 2124 
 2125         if (atomic_fetchadd_int(&vmd->vmd_pageout_wanted, 1) == 0) {
 2126                 vm_domain_pageout_lock(vmd);
 2127                 atomic_store_int(&vmd->vmd_pageout_wanted, 1);
 2128                 wakeup(&vmd->vmd_pageout_wanted);
 2129                 vm_domain_pageout_unlock(vmd);
 2130         }
 2131 }
Cache object: dee445228c66587edc6bcf7e0f8c358a
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/vm/vm_pageout.c

FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_pageout.c