FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_pageout.c


    1 /*-
    2  * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
    3  *
    4  * Copyright (c) 1991 Regents of the University of California.
    5  * All rights reserved.
    6  * Copyright (c) 1994 John S. Dyson
    7  * All rights reserved.
    8  * Copyright (c) 1994 David Greenman
    9  * All rights reserved.
   10  * Copyright (c) 2005 Yahoo! Technologies Norway AS
   11  * All rights reserved.
   12  *
   13  * This code is derived from software contributed to Berkeley by
   14  * The Mach Operating System project at Carnegie-Mellon University.
   15  *
   16  * Redistribution and use in source and binary forms, with or without
   17  * modification, are permitted provided that the following conditions
   18  * are met:
   19  * 1. Redistributions of source code must retain the above copyright
   20  *    notice, this list of conditions and the following disclaimer.
   21  * 2. Redistributions in binary form must reproduce the above copyright
   22  *    notice, this list of conditions and the following disclaimer in the
   23  *    documentation and/or other materials provided with the distribution.
   24  * 3. All advertising materials mentioning features or use of this software
   25  *    must display the following acknowledgement:
   26  *      This product includes software developed by the University of
   27  *      California, Berkeley and its contributors.
   28  * 4. Neither the name of the University nor the names of its contributors
   29  *    may be used to endorse or promote products derived from this software
   30  *    without specific prior written permission.
   31  *
   32  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   33  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   34  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   35  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   36  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   37  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   38  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   39  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   40  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   41  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   42  * SUCH DAMAGE.
   43  *
   44  *      from: @(#)vm_pageout.c  7.4 (Berkeley) 5/7/91
   45  *
   46  *
   47  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
   48  * All rights reserved.
   49  *
   50  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
   51  *
   52  * Permission to use, copy, modify and distribute this software and
   53  * its documentation is hereby granted, provided that both the copyright
   54  * notice and this permission notice appear in all copies of the
   55  * software, derivative works or modified versions, and any portions
   56  * thereof, and that both notices appear in supporting documentation.
   57  *
   58  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   59  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   60  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   61  *
   62  * Carnegie Mellon requests users of this software to return to
   63  *
   64  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   65  *  School of Computer Science
   66  *  Carnegie Mellon University
   67  *  Pittsburgh PA 15213-3890
   68  *
   69  * any improvements or extensions that they make and grant Carnegie the
   70  * rights to redistribute these changes.
   71  */
   72 
   73 /*
   74  *      The proverbial page-out daemon.
   75  */
   76 
   77 #include <sys/cdefs.h>
   78 __FBSDID("$FreeBSD$");
   79 
   80 #include "opt_vm.h"
   81 
   82 #include <sys/param.h>
   83 #include <sys/systm.h>
   84 #include <sys/kernel.h>
   85 #include <sys/eventhandler.h>
   86 #include <sys/lock.h>
   87 #include <sys/mutex.h>
   88 #include <sys/proc.h>
   89 #include <sys/kthread.h>
   90 #include <sys/ktr.h>
   91 #include <sys/mount.h>
   92 #include <sys/racct.h>
   93 #include <sys/resourcevar.h>
   94 #include <sys/sched.h>
   95 #include <sys/sdt.h>
   96 #include <sys/signalvar.h>
   97 #include <sys/smp.h>
   98 #include <sys/time.h>
   99 #include <sys/vnode.h>
  100 #include <sys/vmmeter.h>
  101 #include <sys/rwlock.h>
  102 #include <sys/sx.h>
  103 #include <sys/sysctl.h>
  104 
  105 #include <vm/vm.h>
  106 #include <vm/vm_param.h>
  107 #include <vm/vm_object.h>
  108 #include <vm/vm_page.h>
  109 #include <vm/vm_map.h>
  110 #include <vm/vm_pageout.h>
  111 #include <vm/vm_pager.h>
  112 #include <vm/vm_phys.h>
  113 #include <vm/vm_pagequeue.h>
  114 #include <vm/swap_pager.h>
  115 #include <vm/vm_extern.h>
  116 #include <vm/uma.h>
  117 
  118 /*
  119  * System initialization
  120  */
  121 
  122 /* the kernel process "vm_pageout"*/
  123 static void vm_pageout(void);
  124 static void vm_pageout_init(void);
  125 static int vm_pageout_clean(vm_page_t m, int *numpagedout);
  126 static int vm_pageout_cluster(vm_page_t m);
  127 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
  128     int starting_page_shortage);
  129 
  130 SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
  131     NULL);
  132 
  133 struct proc *pageproc;
  134 
  135 static struct kproc_desc page_kp = {
  136         "pagedaemon",
  137         vm_pageout,
  138         &pageproc
  139 };
  140 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
  141     &page_kp);
  142 
  143 SDT_PROVIDER_DEFINE(vm);
  144 SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
  145 
  146 /* Pagedaemon activity rates, in subdivisions of one second. */
  147 #define VM_LAUNDER_RATE         10
  148 #define VM_INACT_SCAN_RATE      10
  149 
  150 static int vm_pageout_oom_seq = 12;
  151 
  152 static int vm_pageout_update_period;
  153 static int disable_swap_pageouts;
  154 static int lowmem_period = 10;
  155 static int swapdev_enabled;
  156 
  157 static int vm_panic_on_oom = 0;
  158 
  159 SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
  160         CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
  161         "panic on out of memory instead of killing the largest process");
  162 
  163 SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
  164         CTLFLAG_RWTUN, &vm_pageout_update_period, 0,
  165         "Maximum active LRU update period");
  166   
  167 SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0,
  168         "Low memory callback period");
  169 
  170 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
  171         CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
  172 
  173 static int pageout_lock_miss;
  174 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
  175         CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
  176 
  177 SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
  178         CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0,
  179         "back-to-back calls to oom detector to start OOM");
  180 
  181 static int act_scan_laundry_weight = 3;
  182 SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN,
  183     &act_scan_laundry_weight, 0,
  184     "weight given to clean vs. dirty pages in active queue scans");
  185 
  186 static u_int vm_background_launder_rate = 4096;
  187 SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN,
  188     &vm_background_launder_rate, 0,
  189     "background laundering rate, in kilobytes per second");
  190 
  191 static u_int vm_background_launder_max = 20 * 1024;
  192 SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN,
  193     &vm_background_launder_max, 0, "background laundering cap, in kilobytes");
  194 
  195 int vm_pageout_page_count = 32;
  196 
  197 u_int vm_page_max_user_wired;
  198 SYSCTL_UINT(_vm, OID_AUTO, max_wired, CTLFLAG_RW,
  199     &vm_page_max_user_wired, 0,
  200     "system-wide limit to user-wired page count");
  201 
  202 static u_int isqrt(u_int num);
  203 static int vm_pageout_launder(struct vm_domain *vmd, int launder,
  204     bool in_shortfall);
  205 static void vm_pageout_laundry_worker(void *arg);
  206 
  207 struct scan_state {
  208         struct vm_batchqueue bq;
  209         struct vm_pagequeue *pq;
  210         vm_page_t       marker;
  211         int             maxscan;
  212         int             scanned;
  213 };
  214 
  215 static void
  216 vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq,
  217     vm_page_t marker, vm_page_t after, int maxscan)
  218 {
  219 
  220         vm_pagequeue_assert_locked(pq);
  221         KASSERT((marker->aflags & PGA_ENQUEUED) == 0,
  222             ("marker %p already enqueued", marker));
  223 
  224         if (after == NULL)
  225                 TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q);
  226         else
  227                 TAILQ_INSERT_AFTER(&pq->pq_pl, after, marker, plinks.q);
  228         vm_page_aflag_set(marker, PGA_ENQUEUED);
  229 
  230         vm_batchqueue_init(&ss->bq);
  231         ss->pq = pq;
  232         ss->marker = marker;
  233         ss->maxscan = maxscan;
  234         ss->scanned = 0;
  235         vm_pagequeue_unlock(pq);
  236 }
  237 
  238 static void
  239 vm_pageout_end_scan(struct scan_state *ss)
  240 {
  241         struct vm_pagequeue *pq;
  242 
  243         pq = ss->pq;
  244         vm_pagequeue_assert_locked(pq);
  245         KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0,
  246             ("marker %p not enqueued", ss->marker));
  247 
  248         TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q);
  249         vm_page_aflag_clear(ss->marker, PGA_ENQUEUED);
  250         pq->pq_pdpages += ss->scanned;
  251 }
  252 
  253 /*
  254  * Add a small number of queued pages to a batch queue for later processing
  255  * without the corresponding queue lock held.  The caller must have enqueued a
  256  * marker page at the desired start point for the scan.  Pages will be
  257  * physically dequeued if the caller so requests.  Otherwise, the returned
  258  * batch may contain marker pages, and it is up to the caller to handle them.
  259  *
  260  * When processing the batch queue, vm_page_queue() must be used to
  261  * determine whether the page has been logically dequeued by another thread.
  262  * Once this check is performed, the page lock guarantees that the page will
  263  * not be disassociated from the queue.
  264  */
  265 static __always_inline void
  266 vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue)
  267 {
  268         struct vm_pagequeue *pq;
  269         vm_page_t m, marker, n;
  270 
  271         marker = ss->marker;
  272         pq = ss->pq;
  273 
  274         KASSERT((marker->aflags & PGA_ENQUEUED) != 0,
  275             ("marker %p not enqueued", ss->marker));
  276 
  277         vm_pagequeue_lock(pq);
  278         for (m = TAILQ_NEXT(marker, plinks.q); m != NULL &&
  279             ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE;
  280             m = n, ss->scanned++) {
  281                 n = TAILQ_NEXT(m, plinks.q);
  282                 if ((m->flags & PG_MARKER) == 0) {
  283                         KASSERT((m->aflags & PGA_ENQUEUED) != 0,
  284                             ("page %p not enqueued", m));
  285                         KASSERT((m->flags & PG_FICTITIOUS) == 0,
  286                             ("Fictitious page %p cannot be in page queue", m));
  287                         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  288                             ("Unmanaged page %p cannot be in page queue", m));
  289                 } else if (dequeue)
  290                         continue;
  291 
  292                 (void)vm_batchqueue_insert(&ss->bq, m);
  293                 if (dequeue) {
  294                         TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
  295                         vm_page_aflag_clear(m, PGA_ENQUEUED);
  296                 }
  297         }
  298         TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);
  299         if (__predict_true(m != NULL))
  300                 TAILQ_INSERT_BEFORE(m, marker, plinks.q);
  301         else
  302                 TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q);
  303         if (dequeue)
  304                 vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt);
  305         vm_pagequeue_unlock(pq);
  306 }
  307 
  308 /* Return the next page to be scanned, or NULL if the scan is complete. */
  309 static __always_inline vm_page_t
  310 vm_pageout_next(struct scan_state *ss, const bool dequeue)
  311 {
  312 
  313         if (ss->bq.bq_cnt == 0)
  314                 vm_pageout_collect_batch(ss, dequeue);
  315         return (vm_batchqueue_pop(&ss->bq));
  316 }
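       /*
        * Editorial note: an illustrative sketch (not part of the original
        * file) of how the scan helpers above are meant to be combined,
        * modeled on vm_pageout_launder() and vm_pageout_scan_active()
        * below.  The queue, marker, and per-page work are placeholders.
        *
        *      vm_pagequeue_lock(pq);
        *      vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
        *      while ((m = vm_pageout_next(&ss, false)) != NULL) {
        *              if ((m->flags & PG_MARKER) != 0)
        *                      continue;
        *              ... examine or reclaim "m" without the queue lock ...
        *      }
        *      vm_pagequeue_lock(pq);
        *      vm_pageout_end_scan(&ss);
        *      vm_pagequeue_unlock(pq);
        *
        * vm_pageout_init_scan() drops the queue lock before returning and
        * vm_pageout_end_scan() expects it to be held again, hence the
        * re-lock before the final two calls.
        */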
  317 
  318 /*
  319  * Scan for pages at adjacent offsets within the given page's object that are
  320  * eligible for laundering, form a cluster of these pages and the given page,
  321  * and launder that cluster.
  322  */
  323 static int
  324 vm_pageout_cluster(vm_page_t m)
  325 {
  326         vm_object_t object;
  327         vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps;
  328         vm_pindex_t pindex;
  329         int ib, is, page_base, pageout_count;
  330 
  331         vm_page_assert_locked(m);
  332         object = m->object;
  333         VM_OBJECT_ASSERT_WLOCKED(object);
  334         pindex = m->pindex;
  335 
  336         vm_page_assert_unbusied(m);
  337         KASSERT(!vm_page_held(m), ("page %p is held", m));
  338 
  339         pmap_remove_write(m);
  340         vm_page_unlock(m);
  341 
  342         mc[vm_pageout_page_count] = pb = ps = m;
  343         pageout_count = 1;
  344         page_base = vm_pageout_page_count;
  345         ib = 1;
  346         is = 1;
  347 
  348         /*
  349          * We can cluster only if the page is not clean, busy, or held, and
  350          * the page is in the laundry queue.
  351          *
  352          * During heavy mmap/modification loads the pageout
  353          * daemon can really fragment the underlying file
  354          * due to flushing pages out of order and not trying to
  355          * align the clusters (which leaves sporadic out-of-order
  356          * holes).  To solve this problem we do the reverse scan
  357          * first and attempt to align our cluster, then do a 
  358          * forward scan if room remains.
  359          */
  360 more:
  361         while (ib != 0 && pageout_count < vm_pageout_page_count) {
  362                 if (ib > pindex) {
  363                         ib = 0;
  364                         break;
  365                 }
  366                 if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
  367                         ib = 0;
  368                         break;
  369                 }
  370                 vm_page_test_dirty(p);
  371                 if (p->dirty == 0) {
  372                         ib = 0;
  373                         break;
  374                 }
  375                 vm_page_lock(p);
  376                 if (vm_page_held(p) || !vm_page_in_laundry(p)) {
  377                         vm_page_unlock(p);
  378                         ib = 0;
  379                         break;
  380                 }
  381                 pmap_remove_write(p);
  382                 vm_page_unlock(p);
  383                 mc[--page_base] = pb = p;
  384                 ++pageout_count;
  385                 ++ib;
  386 
  387                 /*
  388                  * We are at an alignment boundary.  Stop here, and switch
  389                  * directions.  Do not clear ib.
  390                  */
  391                 if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
  392                         break;
  393         }
  394         while (pageout_count < vm_pageout_page_count && 
  395             pindex + is < object->size) {
  396                 if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
  397                         break;
  398                 vm_page_test_dirty(p);
  399                 if (p->dirty == 0)
  400                         break;
  401                 vm_page_lock(p);
  402                 if (vm_page_held(p) || !vm_page_in_laundry(p)) {
  403                         vm_page_unlock(p);
  404                         break;
  405                 }
  406                 pmap_remove_write(p);
  407                 vm_page_unlock(p);
  408                 mc[page_base + pageout_count] = ps = p;
  409                 ++pageout_count;
  410                 ++is;
  411         }
  412 
  413         /*
  414          * If we exhausted our forward scan, continue with the reverse scan
  415          * when possible, even past an alignment boundary.  This catches
  416          * boundary conditions.
  417          */
  418         if (ib != 0 && pageout_count < vm_pageout_page_count)
  419                 goto more;
  420 
  421         return (vm_pageout_flush(&mc[page_base], pageout_count,
  422             VM_PAGER_PUT_NOREUSE, 0, NULL, NULL));
  423 }
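       /*
        * Editorial note: with the default vm_pageout_page_count of 32, "mc"
        * above has 64 slots.  The page passed in is placed at mc[32]
        * (page_base), the backward scan fills mc[31], mc[30], ... while
        * decrementing page_base, and the forward scan appends at
        * mc[page_base + pageout_count].  vm_pageout_flush() is therefore
        * handed a run of pages at adjacent, ascending object offsets
        * starting at &mc[page_base].
        */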
  424 
  425 /*
  426  * vm_pageout_flush() - launder the given pages
  427  *
  428  *      The given pages are laundered.  Note that we setup for the start of
  429  *      I/O ( i.e. busy the page ), mark it read-only, and bump the object
  430  *      reference count all in here rather then in the parent.  If we want
  431  *      the parent to do more sophisticated things we may have to change
  432  *      the ordering.
  433  *
   434  *      The returned runlen is the count of pages between mreq and the first
   435  *      page after mreq with status VM_PAGER_AGAIN.
   436  *      *eio is set to TRUE if the pager returned VM_PAGER_ERROR or VM_PAGER_FAIL
   437  *      for any page in the runlen set.
  438  */
  439 int
  440 vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
  441     boolean_t *eio)
  442 {
  443         vm_object_t object = mc[0]->object;
  444         int pageout_status[count];
  445         int numpagedout = 0;
  446         int i, runlen;
  447 
  448         VM_OBJECT_ASSERT_WLOCKED(object);
  449 
  450         /*
  451          * Initiate I/O.  Mark the pages busy and verify that they're valid
  452          * and read-only.
  453          *
  454          * We do not have to fixup the clean/dirty bits here... we can
  455          * allow the pager to do it after the I/O completes.
  456          *
  457          * NOTE! mc[i]->dirty may be partial or fragmented due to an
  458          * edge case with file fragments.
  459          */
  460         for (i = 0; i < count; i++) {
  461                 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
  462                     ("vm_pageout_flush: partially invalid page %p index %d/%d",
  463                         mc[i], i, count));
  464                 KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0,
  465                     ("vm_pageout_flush: writeable page %p", mc[i]));
  466                 vm_page_sbusy(mc[i]);
  467         }
  468         vm_object_pip_add(object, count);
  469 
  470         vm_pager_put_pages(object, mc, count, flags, pageout_status);
  471 
  472         runlen = count - mreq;
  473         if (eio != NULL)
  474                 *eio = FALSE;
  475         for (i = 0; i < count; i++) {
  476                 vm_page_t mt = mc[i];
  477 
  478                 KASSERT(pageout_status[i] == VM_PAGER_PEND ||
  479                     !pmap_page_is_write_mapped(mt),
  480                     ("vm_pageout_flush: page %p is not write protected", mt));
  481                 switch (pageout_status[i]) {
  482                 case VM_PAGER_OK:
  483                         vm_page_lock(mt);
  484                         if (vm_page_in_laundry(mt))
  485                                 vm_page_deactivate_noreuse(mt);
  486                         vm_page_unlock(mt);
  487                         /* FALLTHROUGH */
  488                 case VM_PAGER_PEND:
  489                         numpagedout++;
  490                         break;
  491                 case VM_PAGER_BAD:
  492                         /*
  493                          * The page is outside the object's range.  We pretend
  494                          * that the page out worked and clean the page, so the
  495                          * changes will be lost if the page is reclaimed by
  496                          * the page daemon.
  497                          */
  498                         vm_page_undirty(mt);
  499                         vm_page_lock(mt);
  500                         if (vm_page_in_laundry(mt))
  501                                 vm_page_deactivate_noreuse(mt);
  502                         vm_page_unlock(mt);
  503                         break;
  504                 case VM_PAGER_ERROR:
  505                 case VM_PAGER_FAIL:
  506                         /*
  507                          * If the page couldn't be paged out to swap because the
  508                          * pager wasn't able to find space, place the page in
  509                          * the PQ_UNSWAPPABLE holding queue.  This is an
  510                          * optimization that prevents the page daemon from
  511                          * wasting CPU cycles on pages that cannot be reclaimed
  512                          * because no swap device is configured.
  513                          *
  514                          * Otherwise, reactivate the page so that it doesn't
  515                          * clog the laundry and inactive queues.  (We will try
  516                          * paging it out again later.)
  517                          */
  518                         vm_page_lock(mt);
  519                         if (object->type == OBJT_SWAP &&
  520                             pageout_status[i] == VM_PAGER_FAIL) {
  521                                 vm_page_unswappable(mt);
  522                                 numpagedout++;
  523                         } else
  524                                 vm_page_activate(mt);
  525                         vm_page_unlock(mt);
  526                         if (eio != NULL && i >= mreq && i - mreq < runlen)
  527                                 *eio = TRUE;
  528                         break;
  529                 case VM_PAGER_AGAIN:
  530                         if (i >= mreq && i - mreq < runlen)
  531                                 runlen = i - mreq;
  532                         break;
  533                 }
  534 
  535                 /*
  536                  * If the operation is still going, leave the page busy to
  537                  * block all other accesses. Also, leave the paging in
  538                  * progress indicator set so that we don't attempt an object
  539                  * collapse.
  540                  */
  541                 if (pageout_status[i] != VM_PAGER_PEND) {
  542                         vm_object_pip_wakeup(object);
  543                         vm_page_sunbusy(mt);
  544                 }
  545         }
  546         if (prunlen != NULL)
  547                 *prunlen = runlen;
  548         return (numpagedout);
  549 }
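       /*
        * Editorial note: a worked example of the runlen bookkeeping above,
        * using hypothetical numbers.  With count = 8 and mreq = 2, runlen
        * starts at 6.  If the pager reports VM_PAGER_AGAIN only for mc[5],
        * runlen becomes 5 - 2 = 3, meaning mc[2] through mc[4] were written
        * without needing a retry.  If any page in that run reported
        * VM_PAGER_ERROR or VM_PAGER_FAIL, *eio is also set to TRUE.
        */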
  550 
  551 static void
  552 vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused)
  553 {
  554 
  555         atomic_store_rel_int(&swapdev_enabled, 1);
  556 }
  557 
  558 static void
  559 vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused)
  560 {
  561 
  562         if (swap_pager_nswapdev() == 1)
  563                 atomic_store_rel_int(&swapdev_enabled, 0);
  564 }
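       /*
        * Editorial note: the "swapdev_enabled" flag maintained by these two
        * handlers is what lets vm_pageout_launder() below decide whether to
        * begin its pass with the PQ_UNSWAPPABLE queue; when the last swap
        * device is removed the flag is cleared and that queue is not
        * scanned.
        */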
  565 
  566 /*
  567  * Attempt to acquire all of the necessary locks to launder a page and
  568  * then call through the clustering layer to PUTPAGES.  Wait a short
  569  * time for a vnode lock.
  570  *
  571  * Requires the page and object lock on entry, releases both before return.
  572  * Returns 0 on success and an errno otherwise.
  573  */
  574 static int
  575 vm_pageout_clean(vm_page_t m, int *numpagedout)
  576 {
  577         struct vnode *vp;
  578         struct mount *mp;
  579         vm_object_t object;
  580         vm_pindex_t pindex;
  581         int error, lockmode;
  582 
  583         vm_page_assert_locked(m);
  584         object = m->object;
  585         VM_OBJECT_ASSERT_WLOCKED(object);
  586         error = 0;
  587         vp = NULL;
  588         mp = NULL;
  589 
  590         /*
  591          * The object is already known NOT to be dead.   It
  592          * is possible for the vget() to block the whole
  593          * pageout daemon, but the new low-memory handling
  594          * code should prevent it.
  595          *
   596          * We can't wait forever for the vnode lock; we might
  597          * deadlock due to a vn_read() getting stuck in
  598          * vm_wait while holding this vnode.  We skip the 
  599          * vnode if we can't get it in a reasonable amount
  600          * of time.
  601          */
  602         if (object->type == OBJT_VNODE) {
  603                 vm_page_unlock(m);
  604                 vp = object->handle;
  605                 if (vp->v_type == VREG &&
  606                     vn_start_write(vp, &mp, V_NOWAIT) != 0) {
  607                         mp = NULL;
  608                         error = EDEADLK;
  609                         goto unlock_all;
  610                 }
  611                 KASSERT(mp != NULL,
  612                     ("vp %p with NULL v_mount", vp));
  613                 vm_object_reference_locked(object);
  614                 pindex = m->pindex;
  615                 VM_OBJECT_WUNLOCK(object);
  616                 lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
  617                     LK_SHARED : LK_EXCLUSIVE;
  618                 if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
  619                         vp = NULL;
  620                         error = EDEADLK;
  621                         goto unlock_mp;
  622                 }
  623                 VM_OBJECT_WLOCK(object);
  624 
  625                 /*
  626                  * Ensure that the object and vnode were not disassociated
  627                  * while locks were dropped.
  628                  */
  629                 if (vp->v_object != object) {
  630                         error = ENOENT;
  631                         goto unlock_all;
  632                 }
  633                 vm_page_lock(m);
  634 
  635                 /*
  636                  * While the object and page were unlocked, the page
  637                  * may have been:
  638                  * (1) moved to a different queue,
  639                  * (2) reallocated to a different object,
  640                  * (3) reallocated to a different offset, or
  641                  * (4) cleaned.
  642                  */
  643                 if (!vm_page_in_laundry(m) || m->object != object ||
  644                     m->pindex != pindex || m->dirty == 0) {
  645                         vm_page_unlock(m);
  646                         error = ENXIO;
  647                         goto unlock_all;
  648                 }
  649 
  650                 /*
  651                  * The page may have been busied or referenced while the object
  652                  * and page locks were released.
  653                  */
  654                 if (vm_page_busied(m) || vm_page_held(m)) {
  655                         vm_page_unlock(m);
  656                         error = EBUSY;
  657                         goto unlock_all;
  658                 }
  659         }
  660 
  661         /*
  662          * If a page is dirty, then it is either being washed
  663          * (but not yet cleaned) or it is still in the
  664          * laundry.  If it is still in the laundry, then we
  665          * start the cleaning operation. 
  666          */
  667         if ((*numpagedout = vm_pageout_cluster(m)) == 0)
  668                 error = EIO;
  669 
  670 unlock_all:
  671         VM_OBJECT_WUNLOCK(object);
  672 
  673 unlock_mp:
  674         vm_page_lock_assert(m, MA_NOTOWNED);
  675         if (mp != NULL) {
  676                 if (vp != NULL)
  677                         vput(vp);
  678                 vm_object_deallocate(object);
  679                 vn_finished_write(mp);
  680         }
  681 
  682         return (error);
  683 }
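       /*
        * Editorial note: a summary of the error protocol above as consumed
        * by vm_pageout_launder() below.  EDEADLK means the vnode lock or
        * write suspension could not be obtained in time (the caller counts
        * this in pageout_lock_miss and vnodes_skipped); ENOENT, ENXIO, and
        * EBUSY mean the object or page changed while the locks were
        * dropped; EIO means the clustering pass laundered nothing.  Only a
        * return value of 0 indicates that *numpagedout pages were handed to
        * the pager.
        */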
  684 
  685 /*
  686  * Attempt to launder the specified number of pages.
  687  *
  688  * Returns the number of pages successfully laundered.
  689  */
  690 static int
  691 vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
  692 {
  693         struct scan_state ss;
  694         struct vm_pagequeue *pq;
  695         struct mtx *mtx;
  696         vm_object_t object;
  697         vm_page_t m, marker;
  698         int act_delta, error, numpagedout, queue, starting_target;
  699         int vnodes_skipped;
  700         bool pageout_ok;
  701 
  702         mtx = NULL;
  703         object = NULL;
  704         starting_target = launder;
  705         vnodes_skipped = 0;
  706 
  707         /*
  708          * Scan the laundry queues for pages eligible to be laundered.  We stop
  709          * once the target number of dirty pages have been laundered, or once
  710          * we've reached the end of the queue.  A single iteration of this loop
  711          * may cause more than one page to be laundered because of clustering.
  712          *
  713          * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no
  714          * swap devices are configured.
  715          */
  716         if (atomic_load_acq_int(&swapdev_enabled))
  717                 queue = PQ_UNSWAPPABLE;
  718         else
  719                 queue = PQ_LAUNDRY;
  720 
  721 scan:
  722         marker = &vmd->vmd_markers[queue];
  723         pq = &vmd->vmd_pagequeues[queue];
  724         vm_pagequeue_lock(pq);
  725         vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
  726         while (launder > 0 && (m = vm_pageout_next(&ss, false)) != NULL) {
  727                 if (__predict_false((m->flags & PG_MARKER) != 0))
  728                         continue;
  729 
  730                 vm_page_change_lock(m, &mtx);
  731 
  732 recheck:
  733                 /*
  734                  * The page may have been disassociated from the queue
  735                  * while locks were dropped.
  736                  */
  737                 if (vm_page_queue(m) != queue)
  738                         continue;
  739 
  740                 /*
  741                  * A requeue was requested, so this page gets a second
  742                  * chance.
  743                  */
  744                 if ((m->aflags & PGA_REQUEUE) != 0) {
  745                         vm_page_requeue(m);
  746                         continue;
  747                 }
  748 
  749                 /*
  750                  * Held pages are essentially stuck in the queue.
  751                  *
  752                  * Wired pages may not be freed.  Complete their removal
  753                  * from the queue now to avoid needless revisits during
  754                  * future scans.
  755                  */
  756                 if (m->hold_count != 0)
  757                         continue;
  758                 if (vm_page_wired(m)) {
  759                         vm_page_dequeue_deferred(m);
  760                         continue;
  761                 }
  762 
  763                 if (object != m->object) {
  764                         if (object != NULL)
  765                                 VM_OBJECT_WUNLOCK(object);
  766                         object = m->object;
  767                         if (!VM_OBJECT_TRYWLOCK(object)) {
  768                                 mtx_unlock(mtx);
  769                                 /* Depends on type-stability. */
  770                                 VM_OBJECT_WLOCK(object);
  771                                 mtx_lock(mtx);
  772                                 goto recheck;
  773                         }
  774                 }
  775 
  776                 if (vm_page_busied(m))
  777                         continue;
  778 
  779                 /*
  780                  * Invalid pages can be easily freed.  They cannot be
  781                  * mapped; vm_page_free() asserts this.
  782                  */
  783                 if (m->valid == 0)
  784                         goto free_page;
  785 
  786                 /*
  787                  * If the page has been referenced and the object is not dead,
  788                  * reactivate or requeue the page depending on whether the
  789                  * object is mapped.
  790                  *
  791                  * Test PGA_REFERENCED after calling pmap_ts_referenced() so
  792                  * that a reference from a concurrently destroyed mapping is
  793                  * observed here and now.
  794                  */
  795                 if (object->ref_count != 0)
  796                         act_delta = pmap_ts_referenced(m);
  797                 else {
  798                         KASSERT(!pmap_page_is_mapped(m),
  799                             ("page %p is mapped", m));
  800                         act_delta = 0;
  801                 }
  802                 if ((m->aflags & PGA_REFERENCED) != 0) {
  803                         vm_page_aflag_clear(m, PGA_REFERENCED);
  804                         act_delta++;
  805                 }
  806                 if (act_delta != 0) {
  807                         if (object->ref_count != 0) {
  808                                 VM_CNT_INC(v_reactivated);
  809                                 vm_page_activate(m);
  810 
  811                                 /*
  812                                  * Increase the activation count if the page
  813                                  * was referenced while in the laundry queue.
  814                                  * This makes it less likely that the page will
  815                                  * be returned prematurely to the inactive
  816                                  * queue.
  817                                  */
  818                                 m->act_count += act_delta + ACT_ADVANCE;
  819 
  820                                 /*
  821                                  * If this was a background laundering, count
  822                                  * activated pages towards our target.  The
  823                                  * purpose of background laundering is to ensure
  824                                  * that pages are eventually cycled through the
  825                                  * laundry queue, and an activation is a valid
  826                                  * way out.
  827                                  */
  828                                 if (!in_shortfall)
  829                                         launder--;
  830                                 continue;
  831                         } else if ((object->flags & OBJ_DEAD) == 0) {
  832                                 vm_page_requeue(m);
  833                                 continue;
  834                         }
  835                 }
  836 
  837                 /*
  838                  * If the page appears to be clean at the machine-independent
  839                  * layer, then remove all of its mappings from the pmap in
  840                  * anticipation of freeing it.  If, however, any of the page's
  841                  * mappings allow write access, then the page may still be
  842                  * modified until the last of those mappings are removed.
  843                  */
  844                 if (object->ref_count != 0) {
  845                         vm_page_test_dirty(m);
  846                         if (m->dirty == 0)
  847                                 pmap_remove_all(m);
  848                 }
  849 
  850                 /*
  851                  * Clean pages are freed, and dirty pages are paged out unless
  852                  * they belong to a dead object.  Requeueing dirty pages from
  853                  * dead objects is pointless, as they are being paged out and
  854                  * freed by the thread that destroyed the object.
  855                  */
  856                 if (m->dirty == 0) {
  857 free_page:
  858                         vm_page_free(m);
  859                         VM_CNT_INC(v_dfree);
  860                 } else if ((object->flags & OBJ_DEAD) == 0) {
  861                         if (object->type != OBJT_SWAP &&
  862                             object->type != OBJT_DEFAULT)
  863                                 pageout_ok = true;
  864                         else if (disable_swap_pageouts)
  865                                 pageout_ok = false;
  866                         else
  867                                 pageout_ok = true;
  868                         if (!pageout_ok) {
  869                                 vm_page_requeue(m);
  870                                 continue;
  871                         }
  872 
  873                         /*
  874                          * Form a cluster with adjacent, dirty pages from the
  875                          * same object, and page out that entire cluster.
  876                          *
  877                          * The adjacent, dirty pages must also be in the
  878                          * laundry.  However, their mappings are not checked
  879                          * for new references.  Consequently, a recently
  880                          * referenced page may be paged out.  However, that
  881                          * page will not be prematurely reclaimed.  After page
  882                          * out, the page will be placed in the inactive queue,
  883                          * where any new references will be detected and the
  884                          * page reactivated.
  885                          */
  886                         error = vm_pageout_clean(m, &numpagedout);
  887                         if (error == 0) {
  888                                 launder -= numpagedout;
  889                                 ss.scanned += numpagedout;
  890                         } else if (error == EDEADLK) {
  891                                 pageout_lock_miss++;
  892                                 vnodes_skipped++;
  893                         }
  894                         mtx = NULL;
  895                         object = NULL;
  896                 }
  897         }
  898         if (mtx != NULL) {
  899                 mtx_unlock(mtx);
  900                 mtx = NULL;
  901         }
  902         if (object != NULL) {
  903                 VM_OBJECT_WUNLOCK(object);
  904                 object = NULL;
  905         }
  906         vm_pagequeue_lock(pq);
  907         vm_pageout_end_scan(&ss);
  908         vm_pagequeue_unlock(pq);
  909 
  910         if (launder > 0 && queue == PQ_UNSWAPPABLE) {
  911                 queue = PQ_LAUNDRY;
  912                 goto scan;
  913         }
  914 
  915         /*
   916          * Wake up the sync daemon if we skipped a vnode in a writeable object
  917          * and we didn't launder enough pages.
  918          */
  919         if (vnodes_skipped > 0 && launder > 0)
  920                 (void)speedup_syncer();
  921 
  922         return (starting_target - launder);
  923 }
  924 
  925 /*
  926  * Compute the integer square root.
  927  */
  928 static u_int
  929 isqrt(u_int num)
  930 {
  931         u_int bit, root, tmp;
  932 
  933         bit = 1u << ((NBBY * sizeof(u_int)) - 2);
  934         while (bit > num)
  935                 bit >>= 2;
  936         root = 0;
  937         while (bit != 0) {
  938                 tmp = root + bit;
  939                 root >>= 1;
  940                 if (num >= tmp) {
  941                         num -= tmp;
  942                         root += bit;
  943                 }
  944                 bit >>= 2;
  945         }
  946         return (root);
  947 }
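       /*
        * Editorial note: this is the standard bit-by-bit integer square
        * root, returning the floor of the square root of "num".  For
        * example, isqrt(152) yields 12, since 12 * 12 = 144 <= 152 < 169.
        */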
  948 
  949 /*
  950  * Perform the work of the laundry thread: periodically wake up and determine
  951  * whether any pages need to be laundered.  If so, determine the number of pages
  952  * that need to be laundered, and launder them.
  953  */
  954 static void
  955 vm_pageout_laundry_worker(void *arg)
  956 {
  957         struct vm_domain *vmd;
  958         struct vm_pagequeue *pq;
  959         uint64_t nclean, ndirty, nfreed;
  960         int domain, last_target, launder, shortfall, shortfall_cycle, target;
  961         bool in_shortfall;
  962 
  963         domain = (uintptr_t)arg;
  964         vmd = VM_DOMAIN(domain);
  965         pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
  966         KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
  967 
  968         shortfall = 0;
  969         in_shortfall = false;
  970         shortfall_cycle = 0;
  971         last_target = target = 0;
  972         nfreed = 0;
  973 
  974         /*
  975          * Calls to these handlers are serialized by the swap syscall lock.
  976          */
  977         (void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, vmd,
  978             EVENTHANDLER_PRI_ANY);
  979         (void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, vmd,
  980             EVENTHANDLER_PRI_ANY);
  981 
  982         /*
  983          * The pageout laundry worker is never done, so loop forever.
  984          */
  985         for (;;) {
  986                 KASSERT(target >= 0, ("negative target %d", target));
  987                 KASSERT(shortfall_cycle >= 0,
  988                     ("negative cycle %d", shortfall_cycle));
  989                 launder = 0;
  990 
  991                 /*
  992                  * First determine whether we need to launder pages to meet a
  993                  * shortage of free pages.
  994                  */
  995                 if (shortfall > 0) {
  996                         in_shortfall = true;
  997                         shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
  998                         target = shortfall;
  999                 } else if (!in_shortfall)
 1000                         goto trybackground;
 1001                 else if (shortfall_cycle == 0 || vm_laundry_target(vmd) <= 0) {
 1002                         /*
 1003                          * We recently entered shortfall and began laundering
 1004                          * pages.  If we have completed that laundering run
 1005                          * (and we are no longer in shortfall) or we have met
 1006                          * our laundry target through other activity, then we
 1007                          * can stop laundering pages.
 1008                          */
 1009                         in_shortfall = false;
 1010                         target = 0;
 1011                         goto trybackground;
 1012                 }
 1013                 launder = target / shortfall_cycle--;
 1014                 goto dolaundry;
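                       /*
                        * Editorial note: with the default rates defined
                        * above (VM_LAUNDER_RATE == VM_INACT_SCAN_RATE == 10),
                        * shortfall_cycle starts at 1, so the entire shortfall
                        * target is attempted in a single laundering pass.
                        */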
 1015 
 1016                 /*
 1017                  * There's no immediate need to launder any pages; see if we
 1018                  * meet the conditions to perform background laundering:
 1019                  *
 1020                  * 1. The ratio of dirty to clean inactive pages exceeds the
 1021                  *    background laundering threshold, or
 1022                  * 2. we haven't yet reached the target of the current
 1023                  *    background laundering run.
 1024                  *
 1025                  * The background laundering threshold is not a constant.
 1026                  * Instead, it is a slowly growing function of the number of
 1027                  * clean pages freed by the page daemon since the last
 1028                  * background laundering.  Thus, as the ratio of dirty to
 1029                  * clean inactive pages grows, the amount of memory pressure
 1030                  * required to trigger laundering decreases.  We ensure
 1031                  * that the threshold is non-zero after an inactive queue
 1032                  * scan, even if that scan failed to free a single clean page.
 1033                  */
 1034 trybackground:
 1035                 nclean = vmd->vmd_free_count +
 1036                     vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt;
 1037                 ndirty = vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt;
 1038                 if (target == 0 && ndirty * isqrt(howmany(nfreed + 1,
 1039                     vmd->vmd_free_target - vmd->vmd_free_min)) >= nclean) {
 1040                         target = vmd->vmd_background_launder_target;
 1041                 }
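                       /*
                        * Editorial note, using hypothetical numbers: writing
                        * N for (vmd_free_target - vmd_free_min), the
                        * multiplier above is isqrt(howmany(nfreed + 1, N)).
                        * While fewer than N clean pages have been freed, the
                        * multiplier is 1 and background laundering begins
                        * once ndirty >= nclean; after roughly 9 * N clean
                        * pages have been freed it is 3, so laundering begins
                        * once ndirty is roughly nclean / 3 or more.
                        */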
 1042 
 1043                 /*
 1044                  * We have a non-zero background laundering target.  If we've
 1045                  * laundered up to our maximum without observing a page daemon
 1046                  * request, just stop.  This is a safety belt that ensures we
 1047                  * don't launder an excessive amount if memory pressure is low
 1048                  * and the ratio of dirty to clean pages is large.  Otherwise,
 1049                  * proceed at the background laundering rate.
 1050                  */
 1051                 if (target > 0) {
 1052                         if (nfreed > 0) {
 1053                                 nfreed = 0;
 1054                                 last_target = target;
 1055                         } else if (last_target - target >=
 1056                             vm_background_launder_max * PAGE_SIZE / 1024) {
 1057                                 target = 0;
 1058                         }
 1059                         launder = vm_background_launder_rate * PAGE_SIZE / 1024;
 1060                         launder /= VM_LAUNDER_RATE;
 1061                         if (launder > target)
 1062                                 launder = target;
 1063                 }
 1064 
 1065 dolaundry:
 1066                 if (launder > 0) {
 1067                         /*
 1068                          * Because of I/O clustering, the number of laundered
 1069                          * pages could exceed "target" by the maximum size of
 1070                          * a cluster minus one. 
 1071                          */
 1072                         target -= min(vm_pageout_launder(vmd, launder,
 1073                             in_shortfall), target);
 1074                         pause("laundp", hz / VM_LAUNDER_RATE);
 1075                 }
 1076 
 1077                 /*
 1078                  * If we're not currently laundering pages and the page daemon
 1079                  * hasn't posted a new request, sleep until the page daemon
 1080                  * kicks us.
 1081                  */
 1082                 vm_pagequeue_lock(pq);
 1083                 if (target == 0 && vmd->vmd_laundry_request == VM_LAUNDRY_IDLE)
 1084                         (void)mtx_sleep(&vmd->vmd_laundry_request,
 1085                             vm_pagequeue_lockptr(pq), PVM, "launds", 0);
 1086 
 1087                 /*
 1088                  * If the pagedaemon has indicated that it's in shortfall, start
 1089                  * a shortfall laundering unless we're already in the middle of
 1090                  * one.  This may preempt a background laundering.
 1091                  */
 1092                 if (vmd->vmd_laundry_request == VM_LAUNDRY_SHORTFALL &&
 1093                     (!in_shortfall || shortfall_cycle == 0)) {
 1094                         shortfall = vm_laundry_target(vmd) +
 1095                             vmd->vmd_pageout_deficit;
 1096                         target = 0;
 1097                 } else
 1098                         shortfall = 0;
 1099 
 1100                 if (target == 0)
 1101                         vmd->vmd_laundry_request = VM_LAUNDRY_IDLE;
 1102                 nfreed += vmd->vmd_clean_pages_freed;
 1103                 vmd->vmd_clean_pages_freed = 0;
 1104                 vm_pagequeue_unlock(pq);
 1105         }
 1106 }
 1107 
 1108 /*
 1109  * Compute the number of pages we want to try to move from the
 1110  * active queue to either the inactive or laundry queue.
 1111  *
 1112  * When scanning active pages during a shortage, we make clean pages
 1113  * count more heavily towards the page shortage than dirty pages.
 1114  * This is because dirty pages must be laundered before they can be
 1115  * reused and thus have less utility when attempting to quickly
 1116  * alleviate a free page shortage.  However, this weighting also
 1117  * causes the scan to deactivate dirty pages more aggressively,
 1118  * improving the effectiveness of clustering.
 1119  */
 1120 static int
 1121 vm_pageout_active_target(struct vm_domain *vmd)
 1122 {
 1123         int shortage;
 1124 
 1125         shortage = vmd->vmd_inactive_target + vm_paging_target(vmd) -
 1126             (vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt +
 1127             vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight);
 1128         shortage *= act_scan_laundry_weight;
 1129         return (shortage);
 1130 }
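       /*
        * Editorial note: a worked example of the computation above, with
        * hypothetical counts and the default act_scan_laundry_weight of 3.
        * If vmd_inactive_target plus vm_paging_target() is 1000 pages, the
        * inactive queue holds 700 pages, and the laundry queue holds 300,
        * then shortage = (1000 - (700 + 300 / 3)) * 3 = 600.  In
        * vm_pageout_scan_active() below, each clean page deactivated during
        * a shortage subtracts 3 from this target while each page sent to
        * the laundry subtracts 1, matching the weighting described above.
        */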
 1131 
 1132 /*
 1133  * Scan the active queue.  If there is no shortage of inactive pages, scan a
 1134  * small portion of the queue in order to maintain quasi-LRU.
 1135  */
 1136 static void
 1137 vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage)
 1138 {
 1139         struct scan_state ss;
 1140         struct mtx *mtx;
 1141         vm_page_t m, marker;
 1142         struct vm_pagequeue *pq;
 1143         long min_scan;
 1144         int act_delta, max_scan, scan_tick;
 1145 
 1146         marker = &vmd->vmd_markers[PQ_ACTIVE];
 1147         pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
 1148         vm_pagequeue_lock(pq);
 1149 
 1150         /*
  1151          * If we're just idle polling, attempt to visit every
 1152          * active page within 'update_period' seconds.
 1153          */
 1154         scan_tick = ticks;
 1155         if (vm_pageout_update_period != 0) {
 1156                 min_scan = pq->pq_cnt;
 1157                 min_scan *= scan_tick - vmd->vmd_last_active_scan;
 1158                 min_scan /= hz * vm_pageout_update_period;
 1159         } else
 1160                 min_scan = 0;
 1161         if (min_scan > 0 || (page_shortage > 0 && pq->pq_cnt > 0))
 1162                 vmd->vmd_last_active_scan = scan_tick;
 1163 
 1164         /*
 1165          * Scan the active queue for pages that can be deactivated.  Update
 1166          * the per-page activity counter and use it to identify deactivation
 1167          * candidates.  Held pages may be deactivated.
 1168          *
 1169          * To avoid requeuing each page that remains in the active queue, we
 1170          * implement the CLOCK algorithm.  To keep the implementation of the
 1171          * enqueue operation consistent for all page queues, we use two hands,
 1172          * represented by marker pages. Scans begin at the first hand, which
 1173          * precedes the second hand in the queue.  When the two hands meet,
 1174          * they are moved back to the head and tail of the queue, respectively,
 1175          * and scanning resumes.
 1176          */
 1177         max_scan = page_shortage > 0 ? pq->pq_cnt : min_scan;
 1178         mtx = NULL;
 1179 act_scan:
 1180         vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan);
 1181         while ((m = vm_pageout_next(&ss, false)) != NULL) {
 1182                 if (__predict_false(m == &vmd->vmd_clock[1])) {
 1183                         vm_pagequeue_lock(pq);
 1184                         TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
 1185                         TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q);
 1186                         TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0],
 1187                             plinks.q);
 1188                         TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1],
 1189                             plinks.q);
 1190                         max_scan -= ss.scanned;
 1191                         vm_pageout_end_scan(&ss);
 1192                         goto act_scan;
 1193                 }
 1194                 if (__predict_false((m->flags & PG_MARKER) != 0))
 1195                         continue;
 1196 
 1197                 vm_page_change_lock(m, &mtx);
 1198 
 1199                 /*
 1200                  * The page may have been disassociated from the queue
 1201                  * while locks were dropped.
 1202                  */
 1203                 if (vm_page_queue(m) != PQ_ACTIVE)
 1204                         continue;
 1205 
 1206                 /*
 1207                  * Wired pages are dequeued lazily.
 1208                  */
 1209                 if (vm_page_wired(m)) {
 1210                         vm_page_dequeue_deferred(m);
 1211                         continue;
 1212                 }
 1213 
 1214                 /*
 1215                  * Check to see "how much" the page has been used.
 1216                  *
 1217                  * Test PGA_REFERENCED after calling pmap_ts_referenced() so
 1218                  * that a reference from a concurrently destroyed mapping is
 1219                  * observed here and now.
 1220                  *
 1221                  * Perform an unsynchronized object ref count check.  While
 1222                  * the page lock ensures that the page is not reallocated to
 1223                  * another object, in particular, one with unmanaged mappings
 1224                  * that cannot support pmap_ts_referenced(), two races are,
 1225                  * nonetheless, possible:
 1226                  * 1) The count was transitioning to zero, but we saw a non-
 1227                  *    zero value.  pmap_ts_referenced() will return zero
 1228                  *    because the page is not mapped.
 1229                  * 2) The count was transitioning to one, but we saw zero.
 1230                  *    This race delays the detection of a new reference.  At
 1231                  *    worst, we will deactivate and reactivate the page.
 1232                  */
 1233                 if (m->object->ref_count != 0)
 1234                         act_delta = pmap_ts_referenced(m);
 1235                 else
 1236                         act_delta = 0;
 1237                 if ((m->aflags & PGA_REFERENCED) != 0) {
 1238                         vm_page_aflag_clear(m, PGA_REFERENCED);
 1239                         act_delta++;
 1240                 }
 1241 
 1242                 /*
 1243                  * Advance or decay the act_count based on recent usage.
 1244                  */
 1245                 if (act_delta != 0) {
 1246                         m->act_count += ACT_ADVANCE + act_delta;
 1247                         if (m->act_count > ACT_MAX)
 1248                                 m->act_count = ACT_MAX;
 1249                 } else
 1250                         m->act_count -= min(m->act_count, ACT_DECLINE);
 1251 
 1252                 if (m->act_count == 0) {
 1253                         /*
 1254                          * When not short for inactive pages, let dirty pages go
 1255                          * through the inactive queue before moving to the
 1256                          * laundry queues.  This gives them some extra time to
 1257                          * be reactivated, potentially avoiding an expensive
 1258                          * pageout.  However, during a page shortage, the
 1259                          * inactive queue is necessarily small, and so dirty
 1260                          * pages would only spend a trivial amount of time in
 1261                          * the inactive queue.  Therefore, we might as well
 1262                          * place them directly in the laundry queue to reduce
 1263                          * queuing overhead.
 1264                          */
 1265                         if (page_shortage <= 0)
 1266                                 vm_page_deactivate(m);
 1267                         else {
 1268                                 /*
 1269                                  * Calling vm_page_test_dirty() here would
 1270                                  * require acquisition of the object's write
 1271                                  * lock.  However, during a page shortage,
 1272                                  * directing dirty pages into the laundry
 1273                                  * queue is only an optimization and not a
 1274                                  * requirement.  Therefore, we simply rely on
 1275                                  * the opportunistic updates to the page's
 1276                                  * dirty field by the pmap.
 1277                                  */
 1278                                 if (m->dirty == 0) {
 1279                                         vm_page_deactivate(m);
 1280                                         page_shortage -=
 1281                                             act_scan_laundry_weight;
 1282                                 } else {
 1283                                         vm_page_launder(m);
 1284                                         page_shortage--;
 1285                                 }
 1286                         }
 1287                 }
 1288         }
 1289         if (mtx != NULL) {
 1290                 mtx_unlock(mtx);
 1291                 mtx = NULL;
 1292         }
 1293         vm_pagequeue_lock(pq);
 1294         TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
 1295         TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q);
 1296         vm_pageout_end_scan(&ss);
 1297         vm_pagequeue_unlock(pq);
 1298 }
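/*
 * Editor's illustrative sketch -- not part of vm_pageout.c.  The active-queue
 * scan above ages pages with a saturating counter: a referenced page advances
 * act_count by ACT_ADVANCE plus the number of references seen, capped at
 * ACT_MAX, while an idle page decays by ACT_DECLINE without wrapping below
 * zero.  The constants below mirror common kernel defaults but are assumed
 * values in this stand-alone program.
 */
#include <stdio.h>

#define ACT_ADVANCE     3       /* assumed */
#define ACT_DECLINE     1       /* assumed */
#define ACT_MAX         64      /* assumed */

static unsigned char
act_count_update(unsigned char act_count, int act_delta)
{
        if (act_delta != 0) {
                /* Referenced since the last scan: age upward, saturating. */
                int next = act_count + ACT_ADVANCE + act_delta;
                return (next > ACT_MAX ? ACT_MAX : (unsigned char)next);
        }
        /* Idle since the last scan: decay toward zero without wrapping. */
        return (act_count >= ACT_DECLINE ? act_count - ACT_DECLINE : 0);
}

int
main(void)
{
        unsigned char ac = 0;

        ac = act_count_update(ac, 2);   /* referenced by two mappings */
        ac = act_count_update(ac, 0);   /* idle pass */
        printf("act_count after one busy and one idle scan: %u\n", ac);
        return (0);
}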
 1299 
 1300 static int
 1301 vm_pageout_reinsert_inactive_page(struct scan_state *ss, vm_page_t m)
 1302 {
 1303         struct vm_domain *vmd;
 1304 
 1305         if (m->queue != PQ_INACTIVE || (m->aflags & PGA_ENQUEUED) != 0)
 1306                 return (0);
 1307         vm_page_aflag_set(m, PGA_ENQUEUED);
 1308         if ((m->aflags & PGA_REQUEUE_HEAD) != 0) {
 1309                 vmd = vm_pagequeue_domain(m);
 1310                 TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q);
 1311                 vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD);
 1312         } else if ((m->aflags & PGA_REQUEUE) != 0) {
 1313                 TAILQ_INSERT_TAIL(&ss->pq->pq_pl, m, plinks.q);
 1314                 vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD);
 1315         } else
 1316                 TAILQ_INSERT_BEFORE(ss->marker, m, plinks.q);
 1317         return (1);
 1318 }
 1319 
 1320 /*
 1321  * Re-add stuck pages to the inactive queue.  We will examine them again
 1322  * during the next scan.  If the queue state of a page has changed since
 1323  * it was physically removed from the page queue in
 1324  * vm_pageout_collect_batch(), don't do anything with that page.
 1325  */
 1326 static void
 1327 vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq,
 1328     vm_page_t m)
 1329 {
 1330         struct vm_pagequeue *pq;
 1331         int delta;
 1332 
 1333         delta = 0;
 1334         pq = ss->pq;
 1335 
 1336         if (m != NULL) {
 1337                 if (vm_batchqueue_insert(bq, m))
 1338                         return;
 1339                 vm_pagequeue_lock(pq);
 1340                 delta += vm_pageout_reinsert_inactive_page(ss, m);
 1341         } else
 1342                 vm_pagequeue_lock(pq);
 1343         while ((m = vm_batchqueue_pop(bq)) != NULL)
 1344                 delta += vm_pageout_reinsert_inactive_page(ss, m);
 1345         vm_pagequeue_cnt_add(pq, delta);
 1346         vm_pagequeue_unlock(pq);
 1347         vm_batchqueue_init(bq);
 1348 }
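/*
 * Editor's illustrative sketch -- not part of vm_pageout.c.  It shows the
 * batching idea behind vm_pageout_reinsert_inactive(): defer queue
 * insertions into a small per-scan batch and take the queue lock once per
 * flush rather than once per page.  The real code also re-checks each
 * page's queue state under the lock before reinserting it; that detail is
 * omitted here, and all names are hypothetical.
 */
#include <pthread.h>
#include <stdio.h>

#define BATCH_MAX 7                     /* assumed batch size */

struct batchq {
        int     items[BATCH_MAX];
        int     cnt;
};

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static int queue_len;                   /* stands in for pq->pq_cnt */

static int
batchq_insert(struct batchq *bq, int item)
{
        if (bq->cnt == BATCH_MAX)
                return (0);             /* full: caller must flush first */
        bq->items[bq->cnt++] = item;
        return (1);
}

static void
batchq_flush(struct batchq *bq)
{
        /* One lock acquisition covers the whole batch. */
        pthread_mutex_lock(&queue_lock);
        queue_len += bq->cnt;
        pthread_mutex_unlock(&queue_lock);
        bq->cnt = 0;
}

int
main(void)
{
        struct batchq bq = { .cnt = 0 };

        for (int page = 0; page < 20; page++)
                if (!batchq_insert(&bq, page)) {
                        batchq_flush(&bq);
                        (void)batchq_insert(&bq, page);
                }
        batchq_flush(&bq);              /* final partial batch */
        printf("queue length: %d\n", queue_len);
        return (0);
}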
 1349 
 1350 /*
 1351  * Attempt to reclaim the requested number of pages from the inactive queue.
 1352  * Returns true if the shortage was addressed.
 1353  */
 1354 static int
 1355 vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage,
 1356     int *addl_shortage)
 1357 {
 1358         struct scan_state ss;
 1359         struct vm_batchqueue rq;
 1360         struct mtx *mtx;
 1361         vm_page_t m, marker;
 1362         struct vm_pagequeue *pq;
 1363         vm_object_t object;
 1364         int act_delta, addl_page_shortage, deficit, page_shortage;
 1365         int starting_page_shortage;
 1366 
 1367         /*
 1368          * The addl_page_shortage is an estimate of the number of temporarily
 1369          * stuck pages in the inactive queue.  In other words, the
 1370          * number of pages from the inactive count that should be
 1371          * discounted in setting the target for the active queue scan.
 1372          */
 1373         addl_page_shortage = 0;
 1374 
 1375         /*
 1376          * vmd_pageout_deficit counts the number of pages requested in
 1377          * allocations that failed because of a free page shortage.  We assume
 1378          * that the allocations will be reattempted and thus include the deficit
 1379          * in our scan target.
 1380          */
 1381         deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit);
 1382         starting_page_shortage = page_shortage = shortage + deficit;
 1383 
 1384         mtx = NULL;
 1385         object = NULL;
 1386         vm_batchqueue_init(&rq);
 1387 
 1388         /*
 1389          * Start scanning the inactive queue for pages that we can free.  The
 1390          * scan will stop when we reach the target or we have scanned the
 1391          * entire queue.  (Note that m->act_count is not used to make
 1392          * decisions for the inactive queue, only for the active queue.)
 1393          */
 1394         marker = &vmd->vmd_markers[PQ_INACTIVE];
 1395         pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
 1396         vm_pagequeue_lock(pq);
 1397         vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
 1398         while (page_shortage > 0 && (m = vm_pageout_next(&ss, true)) != NULL) {
 1399                 KASSERT((m->flags & PG_MARKER) == 0,
 1400                     ("marker page %p was dequeued", m));
 1401 
 1402                 vm_page_change_lock(m, &mtx);
 1403 
 1404 recheck:
 1405                 /*
 1406                  * The page may have been disassociated from the queue
 1407                  * while locks were dropped.
 1408                  */
 1409                 if (vm_page_queue(m) != PQ_INACTIVE) {
 1410                         addl_page_shortage++;
 1411                         continue;
 1412                 }
 1413 
 1414                 /*
 1415                  * The page was re-enqueued after the page queue lock was
 1416                  * dropped, or a requeue was requested.  This page gets a second
 1417                  * chance.
 1418                  */
 1419                 if ((m->aflags & (PGA_ENQUEUED | PGA_REQUEUE |
 1420                     PGA_REQUEUE_HEAD)) != 0)
 1421                         goto reinsert;
 1422 
 1423                 /*
 1424                  * Held pages are essentially stuck in the queue.  So,
 1425                  * they ought to be discounted from the inactive count.
 1426                  * See the description of addl_page_shortage above.
 1427                  *
 1428                  * Wired pages may not be freed.  Complete their removal
 1429                  * from the queue now to avoid needless revisits during
 1430                  * future scans.
 1431                  */
 1432                 if (m->hold_count != 0) {
 1433                         addl_page_shortage++;
 1434                         goto reinsert;
 1435                 }
 1436                 if (vm_page_wired(m)) {
 1437                         vm_page_dequeue_deferred(m);
 1438                         continue;
 1439                 }
 1440 
 1441                 if (object != m->object) {
 1442                         if (object != NULL)
 1443                                 VM_OBJECT_WUNLOCK(object);
 1444                         object = m->object;
 1445                         if (!VM_OBJECT_TRYWLOCK(object)) {
 1446                                 mtx_unlock(mtx);
 1447                                 /* Depends on type-stability. */
 1448                                 VM_OBJECT_WLOCK(object);
 1449                                 mtx_lock(mtx);
 1450                                 goto recheck;
 1451                         }
 1452                 }
 1453 
 1454                 if (vm_page_busied(m)) {
 1455                         /*
 1456                          * Don't mess with busy pages.  Leave them at
 1457                          * the front of the queue.  Most likely, they
 1458                          * are being paged out and will leave the
 1459                          * queue shortly after the scan finishes.  So,
 1460                          * they ought to be discounted from the
 1461                          * inactive count.
 1462                          */
 1463                         addl_page_shortage++;
 1464                         goto reinsert;
 1465                 }
 1466 
 1467                 /*
 1468                  * Invalid pages can be easily freed.  They cannot be
 1469                  * mapped; vm_page_free() asserts this.
 1470                  */
 1471                 if (m->valid == 0)
 1472                         goto free_page;
 1473 
 1474                 /*
 1475                  * If the page has been referenced and the object is not dead,
 1476                  * reactivate or requeue the page depending on whether the
 1477                  * object is mapped.
 1478                  *
 1479                  * Test PGA_REFERENCED after calling pmap_ts_referenced() so
 1480                  * that a reference from a concurrently destroyed mapping is
 1481                  * observed here and now.
 1482                  */
 1483                 if (object->ref_count != 0)
 1484                         act_delta = pmap_ts_referenced(m);
 1485                 else {
 1486                         KASSERT(!pmap_page_is_mapped(m),
 1487                             ("page %p is mapped", m));
 1488                         act_delta = 0;
 1489                 }
 1490                 if ((m->aflags & PGA_REFERENCED) != 0) {
 1491                         vm_page_aflag_clear(m, PGA_REFERENCED);
 1492                         act_delta++;
 1493                 }
 1494                 if (act_delta != 0) {
 1495                         if (object->ref_count != 0) {
 1496                                 VM_CNT_INC(v_reactivated);
 1497                                 vm_page_activate(m);
 1498 
 1499                                 /*
 1500                                  * Increase the activation count if the page
 1501                                  * was referenced while in the inactive queue.
 1502                                  * This makes it less likely that the page will
 1503                                  * be returned prematurely to the inactive
 1504                                  * queue.
 1505                                  */
 1506                                 m->act_count += act_delta + ACT_ADVANCE;
 1507                                 continue;
 1508                         } else if ((object->flags & OBJ_DEAD) == 0) {
 1509                                 vm_page_aflag_set(m, PGA_REQUEUE);
 1510                                 goto reinsert;
 1511                         }
 1512                 }
 1513 
 1514                 /*
 1515                  * If the page appears to be clean at the machine-independent
 1516                  * layer, then remove all of its mappings from the pmap in
 1517                  * anticipation of freeing it.  If, however, any of the page's
 1518                  * mappings allow write access, then the page may still be
 1519                  * modified until the last of those mappings is removed.
 1520                  */
 1521                 if (object->ref_count != 0) {
 1522                         vm_page_test_dirty(m);
 1523                         if (m->dirty == 0)
 1524                                 pmap_remove_all(m);
 1525                 }
 1526 
 1527                 /*
 1528                  * Clean pages can be freed, but dirty pages must be sent back
 1529                  * to the laundry, unless they belong to a dead object.
 1530                  * Requeueing dirty pages from dead objects is pointless, as
 1531                  * they are being paged out and freed by the thread that
 1532                  * destroyed the object.
 1533                  */
 1534                 if (m->dirty == 0) {
 1535 free_page:
 1536                         /*
 1537                          * Because we dequeued the page and have already
 1538                          * checked for concurrent dequeue and enqueue
 1539                          * requests, we can safely disassociate the page
 1540                          * from the inactive queue.
 1541                          */
 1542                         KASSERT((m->aflags & PGA_QUEUE_STATE_MASK) == 0,
 1543                             ("page %p has queue state", m));
 1544                         m->queue = PQ_NONE;
 1545                         vm_page_free(m);
 1546                         page_shortage--;
 1547                 } else if ((object->flags & OBJ_DEAD) == 0)
 1548                         vm_page_launder(m);
 1549                 continue;
 1550 reinsert:
 1551                 vm_pageout_reinsert_inactive(&ss, &rq, m);
 1552         }
 1553         if (mtx != NULL)
 1554                 mtx_unlock(mtx);
 1555         if (object != NULL)
 1556                 VM_OBJECT_WUNLOCK(object);
 1557         vm_pageout_reinsert_inactive(&ss, &rq, NULL);
 1558         vm_pageout_reinsert_inactive(&ss, &ss.bq, NULL);
 1559         vm_pagequeue_lock(pq);
 1560         vm_pageout_end_scan(&ss);
 1561         vm_pagequeue_unlock(pq);
 1562 
 1563         VM_CNT_ADD(v_dfree, starting_page_shortage - page_shortage);
 1564 
 1565         /*
 1566          * Wake up the laundry thread so that it can perform any needed
 1567          * laundering.  If we didn't meet our target, we're in shortfall and
 1568          * need to launder more aggressively.  If PQ_LAUNDRY is empty and no
 1569          * swap devices are configured, the laundry thread has no work to do, so
 1570          * don't bother waking it up.
 1571          *
 1572          * The laundry thread uses the number of inactive queue scans elapsed
 1573          * since the last laundering to determine whether to launder again, so
 1574          * keep count.
 1575          */
 1576         if (starting_page_shortage > 0) {
 1577                 pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
 1578                 vm_pagequeue_lock(pq);
 1579                 if (vmd->vmd_laundry_request == VM_LAUNDRY_IDLE &&
 1580                     (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) {
 1581                         if (page_shortage > 0) {
 1582                                 vmd->vmd_laundry_request = VM_LAUNDRY_SHORTFALL;
 1583                                 VM_CNT_INC(v_pdshortfalls);
 1584                         } else if (vmd->vmd_laundry_request !=
 1585                             VM_LAUNDRY_SHORTFALL)
 1586                                 vmd->vmd_laundry_request =
 1587                                     VM_LAUNDRY_BACKGROUND;
 1588                         wakeup(&vmd->vmd_laundry_request);
 1589                 }
 1590                 vmd->vmd_clean_pages_freed +=
 1591                     starting_page_shortage - page_shortage;
 1592                 vm_pagequeue_unlock(pq);
 1593         }
 1594 
 1595         /*
 1596          * Wake up the swapout daemon if we didn't free the targeted number of
 1597          * pages.
 1598          */
 1599         if (page_shortage > 0)
 1600                 vm_swapout_run();
 1601 
 1602         /*
 1603          * If the inactive queue scan fails repeatedly to meet its
 1604          * target, kill the largest process.
 1605          */
 1606         vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);
 1607 
 1608         /*
 1609          * Reclaim pages by swapping out idle processes, if configured to do so.
 1610          */
 1611         vm_swapout_run_idle();
 1612 
 1613         /*
 1614          * See the description of addl_page_shortage above.
 1615          */
 1616         *addl_shortage = addl_page_shortage + deficit;
 1617 
 1618         return (page_shortage <= 0);
 1619 }
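/*
 * Editor's illustrative sketch -- not part of vm_pageout.c.  It condenses the
 * per-page policy of the inactive scan above into a pure function: invalid
 * pages are freed, referenced pages on live objects are reactivated,
 * referenced pages whose object is unmapped but not dead are requeued for a
 * second chance, clean pages are freed, and dirty pages on live objects go
 * to the laundry.  The enum and struct names are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

enum disposition {
        REACTIVATE,
        REQUEUE,
        FREE_PAGE,
        LAUNDER,
        DROP            /* dead object: leave it for the teardown to free */
};

struct page_state {
        int     act_delta;      /* references seen since the last scan */
        bool    object_live;    /* object->ref_count != 0 */
        bool    object_dead;    /* OBJ_DEAD set */
        bool    valid;          /* page contents are valid */
        bool    dirty;          /* page was modified */
};

static enum disposition
inactive_disposition(const struct page_state *ps)
{
        if (!ps->valid)
                return (FREE_PAGE);
        if (ps->act_delta != 0) {
                if (ps->object_live)
                        return (REACTIVATE);
                if (!ps->object_dead)
                        return (REQUEUE);
        }
        if (!ps->dirty)
                return (FREE_PAGE);
        return (ps->object_dead ? DROP : LAUNDER);
}

int
main(void)
{
        struct page_state clean = { 0, true, false, true, false };
        struct page_state dirty = { 0, true, false, true, true };

        printf("clean, unreferenced: %d (FREE_PAGE=%d)\n",
            inactive_disposition(&clean), FREE_PAGE);
        printf("dirty, unreferenced: %d (LAUNDER=%d)\n",
            inactive_disposition(&dirty), LAUNDER);
        return (0);
}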
 1620 
 1621 static int vm_pageout_oom_vote;
 1622 
 1623 /*
 1624  * The pagedaemon threads randomly select one to perform the
 1625  * OOM.  Killing processes before all pagedaemons have failed
 1626  * to reach the free page target would be premature.
 1627  */
 1628 static void
 1629 vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
 1630     int starting_page_shortage)
 1631 {
 1632         int old_vote;
 1633 
 1634         if (starting_page_shortage <= 0 || starting_page_shortage !=
 1635             page_shortage)
 1636                 vmd->vmd_oom_seq = 0;
 1637         else
 1638                 vmd->vmd_oom_seq++;
 1639         if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
 1640                 if (vmd->vmd_oom) {
 1641                         vmd->vmd_oom = FALSE;
 1642                         atomic_subtract_int(&vm_pageout_oom_vote, 1);
 1643                 }
 1644                 return;
 1645         }
 1646 
 1647         /*
 1648          * Do not follow the call sequence until the OOM condition is
 1649          * cleared.
 1650          */
 1651         vmd->vmd_oom_seq = 0;
 1652 
 1653         if (vmd->vmd_oom)
 1654                 return;
 1655 
 1656         vmd->vmd_oom = TRUE;
 1657         old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
 1658         if (old_vote != vm_ndomains - 1)
 1659                 return;
 1660 
 1661         /*
 1662          * The current pagedaemon thread is the last in the quorum to
 1663          * start OOM.  Initiate the selection and signaling of the
 1664          * victim.
 1665          */
 1666         vm_pageout_oom(VM_OOM_MEM);
 1667 
 1668         /*
 1669          * After one round of OOM killing, recall our vote.  On the
 1670          * next pass, this pagedaemon will vote again if the low
 1671          * memory condition persists, because vmd_oom is now
 1672          * false.
 1673          */
 1674         vmd->vmd_oom = FALSE;
 1675         atomic_subtract_int(&vm_pageout_oom_vote, 1);
 1676 }
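/*
 * Editor's illustrative sketch -- not part of vm_pageout.c.  It shows the
 * quorum idea behind vm_pageout_mightbe_oom(): each per-domain pagedaemon
 * casts at most one vote, and only the caller whose vote completes the
 * quorum triggers the OOM action.  The domain count and all names here are
 * hypothetical.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NDOMAINS 4                      /* assumed number of NUMA domains */

static atomic_int oom_votes;

/* Returns true if this caller completed the quorum and should run the OOM. */
static bool
oom_vote(bool *voted)
{
        int old;

        if (*voted)
                return (false);         /* one vote per domain */
        *voted = true;
        old = atomic_fetch_add(&oom_votes, 1);
        return (old == NDOMAINS - 1);
}

static void
oom_vote_recall(bool *voted)
{
        if (*voted) {
                *voted = false;
                atomic_fetch_sub(&oom_votes, 1);
        }
}

int
main(void)
{
        bool voted[NDOMAINS] = { false };

        for (int d = 0; d < NDOMAINS; d++)
                if (oom_vote(&voted[d]))
                        printf("domain %d completes the quorum, runs OOM\n", d);
        for (int d = 0; d < NDOMAINS; d++)
                oom_vote_recall(&voted[d]);
        return (0);
}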
 1677 
 1678 /*
 1679  * The OOM killer is the page daemon's action of last resort when
 1680  * memory allocation requests have been stalled for a prolonged period
 1681  * of time because it cannot reclaim memory.  This function computes
 1682  * the approximate number of physical pages that could be reclaimed if
 1683  * the specified address space is destroyed.
 1684  *
 1685  * Private, anonymous memory owned by the address space is the
 1686  * principal resource that we expect to recover after an OOM kill.
 1687  * Since the physical pages mapped by the address space's COW entries
 1688  * are typically shared pages, they are unlikely to be released and so
 1689  * they are not counted.
 1690  *
 1691  * To get to the point where the page daemon runs the OOM killer, its
 1692  * efforts to write-back vnode-backed pages may have stalled.  This
 1693  * could be caused by a memory allocation deadlock in the write path
 1694  * that might be resolved by an OOM kill.  Therefore, physical pages
 1695  * belonging to vnode-backed objects are counted, because they might
 1696  * be freed without being written out first if the address space holds
 1697  * the last reference to an unlinked vnode.
 1698  *
 1699  * Similarly, physical pages belonging to OBJT_PHYS objects are
 1700  * counted because the address space might hold the last reference to
 1701  * the object.
 1702  */
 1703 static long
 1704 vm_pageout_oom_pagecount(struct vmspace *vmspace)
 1705 {
 1706         vm_map_t map;
 1707         vm_map_entry_t entry;
 1708         vm_object_t obj;
 1709         long res;
 1710 
 1711         map = &vmspace->vm_map;
 1712         KASSERT(!map->system_map, ("system map"));
 1713         sx_assert(&map->lock, SA_LOCKED);
 1714         res = 0;
 1715         for (entry = map->header.next; entry != &map->header;
 1716             entry = entry->next) {
 1717                 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 1718                         continue;
 1719                 obj = entry->object.vm_object;
 1720                 if (obj == NULL)
 1721                         continue;
 1722                 if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 &&
 1723                     obj->ref_count != 1)
 1724                         continue;
 1725                 switch (obj->type) {
 1726                 case OBJT_DEFAULT:
 1727                 case OBJT_SWAP:
 1728                 case OBJT_PHYS:
 1729                 case OBJT_VNODE:
 1730                         res += obj->resident_page_count;
 1731                         break;
 1732                 }
 1733         }
 1734         return (res);
 1735 }
 1736 
 1737 static int vm_oom_ratelim_last;
 1738 static int vm_oom_pf_secs = 10;
 1739 SYSCTL_INT(_vm, OID_AUTO, oom_pf_secs, CTLFLAG_RWTUN, &vm_oom_pf_secs, 0,
 1740     "");
 1741 static struct mtx vm_oom_ratelim_mtx;
 1742 
 1743 void
 1744 vm_pageout_oom(int shortage)
 1745 {
 1746         struct proc *p, *bigproc;
 1747         vm_offset_t size, bigsize;
 1748         struct thread *td;
 1749         struct vmspace *vm;
 1750         int now;
 1751         bool breakout;
 1752 
 1753         /*
 1754          * For OOM requests originating from vm_fault(), there is a high
 1755          * chance that a single large process faults simultaneously in
 1756          * several threads.  Also, on an active system running many
 1757          * processes of middle-size, like buildworld, all of them
 1758          * could fault almost simultaneously as well.
 1759          *
 1760          * To avoid killing too many processes, rate-limit OOMs
 1761          * initiated by vm_fault() time-outs on the waits for free
 1762          * pages.
 1763          */
 1764         mtx_lock(&vm_oom_ratelim_mtx);
 1765         now = ticks;
 1766         if (shortage == VM_OOM_MEM_PF &&
 1767             (u_int)(now - vm_oom_ratelim_last) < hz * vm_oom_pf_secs) {
 1768                 mtx_unlock(&vm_oom_ratelim_mtx);
 1769                 return;
 1770         }
 1771         vm_oom_ratelim_last = now;
 1772         mtx_unlock(&vm_oom_ratelim_mtx);
 1773 
 1774         /*
 1775          * We keep the process bigproc locked once we find it, to keep anyone
 1776          * from messing with it; however, there is a possibility of deadlock
 1777          * if process B is bigproc and one of its child processes attempts to
 1778          * propagate a signal to B while we are waiting for another process's
 1779          * lock while walking this list.  To avoid this, we don't block on the
 1780          * process lock but just skip a process if it is already locked.
 1781          */
 1782         bigproc = NULL;
 1783         bigsize = 0;
 1784         sx_slock(&allproc_lock);
 1785         FOREACH_PROC_IN_SYSTEM(p) {
 1786                 PROC_LOCK(p);
 1787 
 1788                 /*
 1789                  * If this is a system, protected or killed process, skip it.
 1790                  */
 1791                 if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
 1792                     P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
 1793                     p->p_pid == 1 || P_KILLED(p) ||
 1794                     (p->p_pid < 48 && swap_pager_avail != 0)) {
 1795                         PROC_UNLOCK(p);
 1796                         continue;
 1797                 }
 1798                 /*
 1799                  * If the process is in a non-running type state,
 1800                  * don't touch it.  Check all the threads individually.
 1801                  */
 1802                 breakout = false;
 1803                 FOREACH_THREAD_IN_PROC(p, td) {
 1804                         thread_lock(td);
 1805                         if (!TD_ON_RUNQ(td) &&
 1806                             !TD_IS_RUNNING(td) &&
 1807                             !TD_IS_SLEEPING(td) &&
 1808                             !TD_IS_SUSPENDED(td) &&
 1809                             !TD_IS_SWAPPED(td)) {
 1810                                 thread_unlock(td);
 1811                                 breakout = true;
 1812                                 break;
 1813                         }
 1814                         thread_unlock(td);
 1815                 }
 1816                 if (breakout) {
 1817                         PROC_UNLOCK(p);
 1818                         continue;
 1819                 }
 1820                 /*
 1821                  * Get the process size.
 1822                  */
 1823                 vm = vmspace_acquire_ref(p);
 1824                 if (vm == NULL) {
 1825                         PROC_UNLOCK(p);
 1826                         continue;
 1827                 }
 1828                 _PHOLD_LITE(p);
 1829                 PROC_UNLOCK(p);
 1830                 sx_sunlock(&allproc_lock);
 1831                 if (!vm_map_trylock_read(&vm->vm_map)) {
 1832                         vmspace_free(vm);
 1833                         sx_slock(&allproc_lock);
 1834                         PRELE(p);
 1835                         continue;
 1836                 }
 1837                 size = vmspace_swap_count(vm);
 1838                 if (shortage == VM_OOM_MEM || shortage == VM_OOM_MEM_PF)
 1839                         size += vm_pageout_oom_pagecount(vm);
 1840                 vm_map_unlock_read(&vm->vm_map);
 1841                 vmspace_free(vm);
 1842                 sx_slock(&allproc_lock);
 1843 
 1844                 /*
 1845                  * If this process is bigger than the biggest one,
 1846                  * remember it.
 1847                  */
 1848                 if (size > bigsize) {
 1849                         if (bigproc != NULL)
 1850                                 PRELE(bigproc);
 1851                         bigproc = p;
 1852                         bigsize = size;
 1853                 } else {
 1854                         PRELE(p);
 1855                 }
 1856         }
 1857         sx_sunlock(&allproc_lock);
 1858         if (bigproc != NULL) {
 1859                 if (vm_panic_on_oom != 0)
 1860                         panic("out of swap space");
 1861                 PROC_LOCK(bigproc);
 1862                 killproc(bigproc, "out of swap space");
 1863                 sched_nice(bigproc, PRIO_MIN);
 1864                 _PRELE(bigproc);
 1865                 PROC_UNLOCK(bigproc);
 1866         }
 1867 }
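/*
 * Editor's illustrative sketch -- not part of vm_pageout.c.  It mirrors the
 * victim-selection loop of vm_pageout_oom(): walk the candidates, skip any
 * whose lock cannot be taken without blocking (to avoid deadlock), and
 * remember the largest eligible one.  The candidate array and its size field
 * are hypothetical stand-ins for the process list and the swap-plus-resident
 * page count; the kernel additionally holds the current best candidate
 * across iterations, a detail omitted here.
 */
#include <pthread.h>
#include <stdio.h>

struct candidate {
        pthread_mutex_t lock;
        long            size;           /* stands in for the OOM score */
        int             eligible;       /* not a system/protected process */
};

static struct candidate *
pick_biggest(struct candidate *c, int n)
{
        struct candidate *big = NULL;
        long bigsize = 0;

        for (int i = 0; i < n; i++) {
                /* Skip a candidate rather than block on its lock. */
                if (pthread_mutex_trylock(&c[i].lock) != 0)
                        continue;
                if (c[i].eligible && c[i].size > bigsize) {
                        bigsize = c[i].size;
                        big = &c[i];
                }
                pthread_mutex_unlock(&c[i].lock);
        }
        return (big);
}

int
main(void)
{
        struct candidate c[3] = {
                { PTHREAD_MUTEX_INITIALIZER, 100, 1 },
                { PTHREAD_MUTEX_INITIALIZER, 400, 1 },
                { PTHREAD_MUTEX_INITIALIZER, 900, 0 },  /* protected */
        };
        struct candidate *big = pick_biggest(c, 3);

        printf("victim size: %ld\n", big != NULL ? big->size : 0L);
        return (0);
}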
 1868 
 1869 static bool
 1870 vm_pageout_lowmem(void)
 1871 {
 1872         static int lowmem_ticks = 0;
 1873         int last;
 1874 
 1875         last = atomic_load_int(&lowmem_ticks);
 1876         while ((u_int)(ticks - last) / hz >= lowmem_period) {
 1877                 if (atomic_fcmpset_int(&lowmem_ticks, &last, ticks) == 0)
 1878                         continue;
 1879 
 1880                 /*
 1881                  * Decrease registered cache sizes.
 1882                  */
 1883                 SDT_PROBE0(vm, , , vm__lowmem_scan);
 1884                 EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES);
 1885 
 1886                 /*
 1887                  * Reclaim UMA memory explicitly, after the registered
 1888                  * lowmem handlers above have drained their caches.
 1889                  */
 1890                 uma_reclaim();
 1891                 return (true);
 1892         }
 1893         return (false);
 1894 }
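/*
 * Editor's illustrative sketch -- not part of vm_pageout.c.  It shows the
 * lock-free "run at most once per period" pattern used by
 * vm_pageout_lowmem(): the caller that wins the compare-and-swap on the
 * last-run timestamp performs the work, while everyone else sees a fresh
 * timestamp and does nothing.  The kernel uses the ticks counter and hz;
 * time() and the period below are assumed substitutes.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define PERIOD_SECS 10                  /* assumed, like lowmem_period */

static atomic_long last_run;            /* time of the last run, in seconds */

static bool
periodic_trigger(void)
{
        long now = (long)time(NULL);
        long last = atomic_load(&last_run);

        while (now - last >= PERIOD_SECS) {
                /* Only one concurrent caller wins the swap and runs. */
                if (atomic_compare_exchange_weak(&last_run, &last, now))
                        return (true);
                /* Lost the race: "last" was reloaded; re-evaluate. */
        }
        return (false);
}

int
main(void)
{
        printf("first call runs: %d\n", periodic_trigger());
        printf("second call runs: %d\n", periodic_trigger());
        return (0);
}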
 1895 
 1896 static void
 1897 vm_pageout_worker(void *arg)
 1898 {
 1899         struct vm_domain *vmd;
 1900         u_int ofree;
 1901         int addl_shortage, domain, shortage;
 1902         bool target_met;
 1903 
 1904         domain = (uintptr_t)arg;
 1905         vmd = VM_DOMAIN(domain);
 1906         shortage = 0;
 1907         target_met = true;
 1908 
 1909         /*
 1910          * XXXKIB It could be useful to bind pageout daemon threads to
 1911          * the cores belonging to the domain, from which vm_page_array
 1912          * is allocated.
 1913          */
 1914 
 1915         KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
 1916         vmd->vmd_last_active_scan = ticks;
 1917 
 1918         /*
 1919          * The pageout daemon worker is never done, so loop forever.
 1920          */
 1921         while (TRUE) {
 1922                 vm_domain_pageout_lock(vmd);
 1923 
 1924                 /*
 1925                  * We need to clear wanted before we check the limits.  This
 1926                  * prevents races with wakers who will check wanted after they
 1927                  * reach the limit.
 1928                  */
 1929                 atomic_store_int(&vmd->vmd_pageout_wanted, 0);
 1930 
 1931                 /*
 1932                  * Might the page daemon need to run again?
 1933                  */
 1934                 if (vm_paging_needed(vmd, vmd->vmd_free_count)) {
 1935                         /*
 1936                          * Yes.  If the scan failed to produce enough free
 1937                          * pages, sleep uninterruptibly for some time in the
 1938                          * hope that the laundry thread will clean some pages.
 1939                          */
 1940                         vm_domain_pageout_unlock(vmd);
 1941                         if (!target_met)
 1942                                 pause("pwait", hz / VM_INACT_SCAN_RATE);
 1943                 } else {
 1944                         /*
 1945                          * No, sleep until the next wakeup or until pages
 1946                          * need to have their reference stats updated.
 1947                          */
 1948                         if (mtx_sleep(&vmd->vmd_pageout_wanted,
 1949                             vm_domain_pageout_lockptr(vmd), PDROP | PVM,
 1950                             "psleep", hz / VM_INACT_SCAN_RATE) == 0)
 1951                                 VM_CNT_INC(v_pdwakeups);
 1952                 }
 1953 
 1954                 /* Prevent spurious wakeups by ensuring that wanted is set. */
 1955                 atomic_store_int(&vmd->vmd_pageout_wanted, 1);
 1956 
 1957                 /*
 1958                  * Use the controller to calculate how many pages to free in
 1959                  * this interval, and scan the inactive queue.  If the lowmem
 1960                  * handlers appear to have freed up some pages, subtract the
 1961                  * difference from the inactive queue scan target.
 1962                  */
 1963                 shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count);
 1964                 if (shortage > 0) {
 1965                         ofree = vmd->vmd_free_count;
 1966                         if (vm_pageout_lowmem() && vmd->vmd_free_count > ofree)
 1967                                 shortage -= min(vmd->vmd_free_count - ofree,
 1968                                     (u_int)shortage);
 1969                         target_met = vm_pageout_scan_inactive(vmd, shortage,
 1970                             &addl_shortage);
 1971                 } else
 1972                         addl_shortage = 0;
 1973 
 1974                 /*
 1975                  * Scan the active queue.  A positive value for shortage
 1976                  * indicates that we must aggressively deactivate pages to avoid
 1977                  * a shortfall.
 1978                  */
 1979                 shortage = vm_pageout_active_target(vmd) + addl_shortage;
 1980                 vm_pageout_scan_active(vmd, shortage);
 1981         }
 1982 }
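/*
 * Editor's illustrative sketch -- not part of vm_pageout.c.  It mirrors the
 * wakeup protocol used by vm_pageout_worker() and pagedaemon_wakeup(): the
 * worker clears "wanted" before re-checking whether work is needed and sets
 * it again once awake, while wakers lock and signal only on the 0 -> 1
 * transition.  A waker arriving between the clear and the sleep still sees
 * wanted == 0, so it takes the lock and delivers the wakeup as soon as the
 * worker releases the lock by sleeping; no request is lost.  All names here
 * are hypothetical.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t wake_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wake_cv = PTHREAD_COND_INITIALIZER;
static atomic_int wanted;
static atomic_int work_needed;

/* Producer side, analogous to pagedaemon_wakeup(). */
static void
request_work(void)
{
        atomic_store(&work_needed, 1);
        if (atomic_fetch_add(&wanted, 1) == 0) {
                pthread_mutex_lock(&wake_lock);
                atomic_store(&wanted, 1);
                pthread_cond_signal(&wake_cv);
                pthread_mutex_unlock(&wake_lock);
        }
}

/* Consumer side, analogous to the top of the vm_pageout_worker() loop. */
static void *
worker(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&wake_lock);
        atomic_store(&wanted, 0);               /* clear before the check */
        while (atomic_load(&work_needed) == 0)
                pthread_cond_wait(&wake_cv, &wake_lock);
        atomic_store(&wanted, 1);               /* suppress further wakeups */
        pthread_mutex_unlock(&wake_lock);
        printf("worker: handling request\n");
        return (NULL);
}

int
main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);
        request_work();
        pthread_join(tid, NULL);
        return (0);
}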
 1983 
 1984 /*
 1985  *      vm_pageout_init initializes basic pageout daemon settings.
 1986  */
 1987 static void
 1988 vm_pageout_init_domain(int domain)
 1989 {
 1990         struct vm_domain *vmd;
 1991         struct sysctl_oid *oid;
 1992 
 1993         vmd = VM_DOMAIN(domain);
 1994         vmd->vmd_interrupt_free_min = 2;
 1995 
 1996         /*
 1997          * v_free_reserved needs to include enough for the largest
 1998          * swap pager structures plus enough for any pv_entry structs
 1999          * when paging. 
 2000          */
 2001         if (vmd->vmd_page_count > 1024)
 2002                 vmd->vmd_free_min = 4 + (vmd->vmd_page_count - 1024) / 200;
 2003         else
 2004                 vmd->vmd_free_min = 4;
 2005         vmd->vmd_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
 2006             vmd->vmd_interrupt_free_min;
 2007         vmd->vmd_free_reserved = vm_pageout_page_count +
 2008             vmd->vmd_pageout_free_min + (vmd->vmd_page_count / 768);
 2009         vmd->vmd_free_severe = vmd->vmd_free_min / 2;
 2010         vmd->vmd_free_target = 4 * vmd->vmd_free_min + vmd->vmd_free_reserved;
 2011         vmd->vmd_free_min += vmd->vmd_free_reserved;
 2012         vmd->vmd_free_severe += vmd->vmd_free_reserved;
 2013         vmd->vmd_inactive_target = (3 * vmd->vmd_free_target) / 2;
 2014         if (vmd->vmd_inactive_target > vmd->vmd_free_count / 3)
 2015                 vmd->vmd_inactive_target = vmd->vmd_free_count / 3;
 2016 
 2017         /*
 2018          * Set the default wakeup threshold to be 10% below the paging
 2019          * target.  This keeps the steady state out of shortfall.
 2020          */
 2021         vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_target / 10) * 9;
 2022 
 2023         /*
 2024          * Target amount of memory to move out of the laundry queue during a
 2025          * background laundering.  This is proportional to the amount of system
 2026          * memory.
 2027          */
 2028         vmd->vmd_background_launder_target = (vmd->vmd_free_target -
 2029             vmd->vmd_free_min) / 10;
 2030 
 2031         /* Initialize the pageout daemon pid controller. */
 2032         pidctrl_init(&vmd->vmd_pid, hz / VM_INACT_SCAN_RATE,
 2033             vmd->vmd_free_target, PIDCTRL_BOUND,
 2034             PIDCTRL_KPD, PIDCTRL_KID, PIDCTRL_KDD);
 2035         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
 2036             "pidctrl", CTLFLAG_RD, NULL, "");
 2037         pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid));
 2038 }
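/*
 * Editor's illustrative sketch -- not part of vm_pageout.c.  It reproduces
 * the arithmetic of vm_pageout_init_domain() for one hypothetical domain so
 * the resulting thresholds can be inspected.  PAGE_SIZE, MAXBSIZE and
 * pageout_page_count below are assumed values, and the clamp of the
 * inactive target to one third of the free count is omitted.
 */
#include <stdio.h>

#define PAGE_SIZE       4096UL          /* assumed */
#define MAXBSIZE        65536UL         /* assumed */

int
main(void)
{
        unsigned long page_count = 262144;      /* e.g. 1 GiB of 4 KiB pages */
        unsigned long pageout_page_count = 32;  /* assumed default */
        unsigned long interrupt_free_min, pageout_free_min, free_min;
        unsigned long free_reserved, free_severe, free_target;
        unsigned long inactive_target, wakeup_thresh;

        interrupt_free_min = 2;
        free_min = page_count > 1024 ? 4 + (page_count - 1024) / 200 : 4;
        pageout_free_min = (2 * MAXBSIZE) / PAGE_SIZE + interrupt_free_min;
        free_reserved = pageout_page_count + pageout_free_min +
            page_count / 768;
        free_severe = free_min / 2;
        free_target = 4 * free_min + free_reserved;
        free_min += free_reserved;              /* same order as the source */
        free_severe += free_reserved;
        inactive_target = (3 * free_target) / 2;
        wakeup_thresh = (free_target / 10) * 9; /* 10% below the target */

        printf("free_min %lu reserved %lu target %lu severe %lu\n",
            free_min, free_reserved, free_target, free_severe);
        printf("inactive_target %lu wakeup_thresh %lu\n",
            inactive_target, wakeup_thresh);
        return (0);
}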
 2039 
 2040 static void
 2041 vm_pageout_init(void)
 2042 {
 2043         u_long freecount;
 2044         int i;
 2045 
 2046         /*
 2047          * Initialize some paging parameters.
 2048          */
 2049         if (vm_cnt.v_page_count < 2000)
 2050                 vm_pageout_page_count = 8;
 2051 
 2052         freecount = 0;
 2053         for (i = 0; i < vm_ndomains; i++) {
 2054                 struct vm_domain *vmd;
 2055 
 2056                 vm_pageout_init_domain(i);
 2057                 vmd = VM_DOMAIN(i);
 2058                 vm_cnt.v_free_reserved += vmd->vmd_free_reserved;
 2059                 vm_cnt.v_free_target += vmd->vmd_free_target;
 2060                 vm_cnt.v_free_min += vmd->vmd_free_min;
 2061                 vm_cnt.v_inactive_target += vmd->vmd_inactive_target;
 2062                 vm_cnt.v_pageout_free_min += vmd->vmd_pageout_free_min;
 2063                 vm_cnt.v_interrupt_free_min += vmd->vmd_interrupt_free_min;
 2064                 vm_cnt.v_free_severe += vmd->vmd_free_severe;
 2065                 freecount += vmd->vmd_free_count;
 2066         }
 2067 
 2068         /*
 2069          * Set the interval, in seconds, for the active queue scan.  We want
 2070          * to visit each page at least once every ten minutes, which prevents
 2071          * worst-case paging behavior caused by a stale active LRU.
 2072          */
 2073         if (vm_pageout_update_period == 0)
 2074                 vm_pageout_update_period = 600;
 2075 
 2076         /*
 2077          * Set the maximum number of user-wired virtual pages.  Historically the
 2078          * main source of such pages was mlock(2) and mlockall(2).  Hypervisors
 2079          * may also request user-wired memory.
 2080          */
 2081         if (vm_page_max_user_wired == 0)
 2082                 vm_page_max_user_wired = 4 * freecount / 5;
 2083 }
 2084 
 2085 /*
 2086  *     vm_pageout is the high level pageout daemon.
 2087  */
 2088 static void
 2089 vm_pageout(void)
 2090 {
 2091         struct proc *p;
 2092         struct thread *td;
 2093         int error, first, i;
 2094 
 2095         p = curproc;
 2096         td = curthread;
 2097 
 2098         mtx_init(&vm_oom_ratelim_mtx, "vmoomr", NULL, MTX_DEF);
 2099         swap_pager_swap_init();
 2100         for (first = -1, i = 0; i < vm_ndomains; i++) {
 2101                 if (VM_DOMAIN_EMPTY(i)) {
 2102                         if (bootverbose)
 2103                                 printf("domain %d empty; skipping pageout\n",
 2104                                     i);
 2105                         continue;
 2106                 }
 2107                 if (first == -1)
 2108                         first = i;
 2109                 else {
 2110                         error = kthread_add(vm_pageout_worker,
 2111                             (void *)(uintptr_t)i, p, NULL, 0, 0, "dom%d", i);
 2112                         if (error != 0)
 2113                                 panic("starting pageout for domain %d: %d\n",
 2114                                     i, error);
 2115                 }
 2116                 error = kthread_add(vm_pageout_laundry_worker,
 2117                     (void *)(uintptr_t)i, p, NULL, 0, 0, "laundry: dom%d", i);
 2118                 if (error != 0)
 2119                         panic("starting laundry for domain %d: %d", i, error);
 2120         }
 2121         error = kthread_add(uma_reclaim_worker, NULL, p, NULL, 0, 0, "uma");
 2122         if (error != 0)
 2123                 panic("starting uma_reclaim helper, error %d\n", error);
 2124 
 2125         snprintf(td->td_name, sizeof(td->td_name), "dom%d", first);
 2126         vm_pageout_worker((void *)(uintptr_t)first);
 2127 }
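/*
 * Editor's illustrative sketch -- not part of vm_pageout.c.  It shows the
 * thread-fanout pattern of vm_pageout(): spawn one worker per additional
 * domain and let the calling thread serve the first non-empty domain itself.
 * pthreads stand in for kthread_add(), the domain count is assumed, and the
 * kernel worker never returns, whereas this sketch does.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NDOMAINS 3                      /* assumed */

static void *
domain_worker(void *arg)
{
        int domain = (int)(intptr_t)arg;

        printf("worker for domain %d\n", domain);
        return (NULL);
}

int
main(void)
{
        pthread_t tids[NDOMAINS];
        int first = -1, nspawned = 0;

        for (int i = 0; i < NDOMAINS; i++) {
                if (first == -1) {
                        first = i;      /* serve this domain ourselves */
                        continue;
                }
                pthread_create(&tids[nspawned++], NULL, domain_worker,
                    (void *)(intptr_t)i);
        }
        for (int i = 0; i < nspawned; i++)
                pthread_join(tids[i], NULL);
        domain_worker((void *)(intptr_t)first); /* current thread serves it */
        return (0);
}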
 2128 
 2129 /*
 2130  * Perform an advisory wakeup of the page daemon.
 2131  */
 2132 void
 2133 pagedaemon_wakeup(int domain)
 2134 {
 2135         struct vm_domain *vmd;
 2136 
 2137         vmd = VM_DOMAIN(domain);
 2138         vm_domain_pageout_assert_unlocked(vmd);
 2139         if (curproc == pageproc)
 2140                 return;
 2141 
 2142         if (atomic_fetchadd_int(&vmd->vmd_pageout_wanted, 1) == 0) {
 2143                 vm_domain_pageout_lock(vmd);
 2144                 atomic_store_int(&vmd->vmd_pageout_wanted, 1);
 2145                 wakeup(&vmd->vmd_pageout_wanted);
 2146                 vm_domain_pageout_unlock(vmd);
 2147         }
 2148 }
