FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_pageout.c


    1 /*-
    2  * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
    3  *
    4  * Copyright (c) 1991 Regents of the University of California.
    5  * All rights reserved.
    6  * Copyright (c) 1994 John S. Dyson
    7  * All rights reserved.
    8  * Copyright (c) 1994 David Greenman
    9  * All rights reserved.
   10  * Copyright (c) 2005 Yahoo! Technologies Norway AS
   11  * All rights reserved.
   12  *
   13  * This code is derived from software contributed to Berkeley by
   14  * The Mach Operating System project at Carnegie-Mellon University.
   15  *
   16  * Redistribution and use in source and binary forms, with or without
   17  * modification, are permitted provided that the following conditions
   18  * are met:
   19  * 1. Redistributions of source code must retain the above copyright
   20  *    notice, this list of conditions and the following disclaimer.
   21  * 2. Redistributions in binary form must reproduce the above copyright
   22  *    notice, this list of conditions and the following disclaimer in the
   23  *    documentation and/or other materials provided with the distribution.
   24  * 3. All advertising materials mentioning features or use of this software
   25  *    must display the following acknowledgement:
   26  *      This product includes software developed by the University of
   27  *      California, Berkeley and its contributors.
   28  * 4. Neither the name of the University nor the names of its contributors
   29  *    may be used to endorse or promote products derived from this software
   30  *    without specific prior written permission.
   31  *
   32  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   33  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   34  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   35  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   36  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   37  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   38  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   39  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   40  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   41  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   42  * SUCH DAMAGE.
   43  *
   44  *      from: @(#)vm_pageout.c  7.4 (Berkeley) 5/7/91
   45  *
   46  *
   47  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
   48  * All rights reserved.
   49  *
   50  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
   51  *
   52  * Permission to use, copy, modify and distribute this software and
   53  * its documentation is hereby granted, provided that both the copyright
   54  * notice and this permission notice appear in all copies of the
   55  * software, derivative works or modified versions, and any portions
   56  * thereof, and that both notices appear in supporting documentation.
   57  *
   58  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   59  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   60  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   61  *
   62  * Carnegie Mellon requests users of this software to return to
   63  *
   64  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   65  *  School of Computer Science
   66  *  Carnegie Mellon University
   67  *  Pittsburgh PA 15213-3890
   68  *
   69  * any improvements or extensions that they make and grant Carnegie the
   70  * rights to redistribute these changes.
   71  */
   72 
   73 /*
   74  *      The proverbial page-out daemon.
   75  */
   76 
   77 #include <sys/cdefs.h>
   78 __FBSDID("$FreeBSD: head/sys/vm/vm_pageout.c 344440 2019-02-21 15:44:32Z markj $");
   79 
   80 #include "opt_vm.h"
   81 
   82 #include <sys/param.h>
   83 #include <sys/systm.h>
   84 #include <sys/kernel.h>
   85 #include <sys/eventhandler.h>
   86 #include <sys/lock.h>
   87 #include <sys/mutex.h>
   88 #include <sys/proc.h>
   89 #include <sys/kthread.h>
   90 #include <sys/ktr.h>
   91 #include <sys/mount.h>
   92 #include <sys/racct.h>
   93 #include <sys/resourcevar.h>
   94 #include <sys/sched.h>
   95 #include <sys/sdt.h>
   96 #include <sys/signalvar.h>
   97 #include <sys/smp.h>
   98 #include <sys/time.h>
   99 #include <sys/vnode.h>
  100 #include <sys/vmmeter.h>
  101 #include <sys/rwlock.h>
  102 #include <sys/sx.h>
  103 #include <sys/sysctl.h>
  104 
  105 #include <vm/vm.h>
  106 #include <vm/vm_param.h>
  107 #include <vm/vm_object.h>
  108 #include <vm/vm_page.h>
  109 #include <vm/vm_map.h>
  110 #include <vm/vm_pageout.h>
  111 #include <vm/vm_pager.h>
  112 #include <vm/vm_phys.h>
  113 #include <vm/vm_pagequeue.h>
  114 #include <vm/swap_pager.h>
  115 #include <vm/vm_extern.h>
  116 #include <vm/uma.h>
  117 
  118 /*
  119  * System initialization
  120  */
  121 
  122 /* The kernel process "vm_pageout". */
  123 static void vm_pageout(void);
  124 static void vm_pageout_init(void);
  125 static int vm_pageout_clean(vm_page_t m, int *numpagedout);
  126 static int vm_pageout_cluster(vm_page_t m);
  127 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
  128     int starting_page_shortage);
  129 
  130 SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
  131     NULL);
  132 
  133 struct proc *pageproc;
  134 
  135 static struct kproc_desc page_kp = {
  136         "pagedaemon",
  137         vm_pageout,
  138         &pageproc
  139 };
  140 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
  141     &page_kp);
  142 
  143 SDT_PROVIDER_DEFINE(vm);
  144 SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
  145 
  146 /* Pagedaemon activity rates, in subdivisions of one second. */
  147 #define VM_LAUNDER_RATE         10
  148 #define VM_INACT_SCAN_RATE      10
  149 
  150 static int vm_pageout_oom_seq = 12;
  151 
  152 static int vm_pageout_update_period;
  153 static int disable_swap_pageouts;
  154 static int lowmem_period = 10;
  155 static int swapdev_enabled;
  156 
  157 static int vm_panic_on_oom = 0;
  158 
  159 SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
  160         CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
  161         "panic on out of memory instead of killing the largest process");
  162 
  163 SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
  164         CTLFLAG_RWTUN, &vm_pageout_update_period, 0,
  165         "Maximum active LRU update period");
  166   
  167 SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0,
  168         "Low memory callback period");
  169 
  170 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
  171         CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
  172 
  173 static int pageout_lock_miss;
  174 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
  175         CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
  176 
  177 SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
  178         CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0,
  179         "back-to-back calls to oom detector to start OOM");
  180 
  181 static int act_scan_laundry_weight = 3;
  182 SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN,
  183     &act_scan_laundry_weight, 0,
  184     "weight given to clean vs. dirty pages in active queue scans");
  185 
  186 static u_int vm_background_launder_rate = 4096;
  187 SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN,
  188     &vm_background_launder_rate, 0,
  189     "background laundering rate, in kilobytes per second");
  190 
  191 static u_int vm_background_launder_max = 20 * 1024;
  192 SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN,
  193     &vm_background_launder_max, 0, "background laundering cap, in kilobytes");
  194 
  195 int vm_pageout_page_count = 32;
  196 
  197 int vm_page_max_wired;          /* XXX max # of wired pages system-wide */
  198 SYSCTL_INT(_vm, OID_AUTO, max_wired,
  199         CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
  200 
  201 static u_int isqrt(u_int num);
  202 static int vm_pageout_launder(struct vm_domain *vmd, int launder,
  203     bool in_shortfall);
  204 static void vm_pageout_laundry_worker(void *arg);
  205 
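      /*
       * State for a page queue scan.  The marker page records the current
       * position in the page queue "pq", and pages are gathered into the
       * batch queue "bq" so that they can be processed without the page
       * queue lock held.  "maxscan" bounds the number of pages visited
       * during the scan and "scanned" counts those actually visited.
       */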
  206 struct scan_state {
  207         struct vm_batchqueue bq;
  208         struct vm_pagequeue *pq;
  209         vm_page_t       marker;
  210         int             maxscan;
  211         int             scanned;
  212 };
  213 
  214 static void
  215 vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq,
  216     vm_page_t marker, vm_page_t after, int maxscan)
  217 {
  218 
  219         vm_pagequeue_assert_locked(pq);
  220         KASSERT((marker->aflags & PGA_ENQUEUED) == 0,
  221             ("marker %p already enqueued", marker));
  222 
  223         if (after == NULL)
  224                 TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q);
  225         else
  226                 TAILQ_INSERT_AFTER(&pq->pq_pl, after, marker, plinks.q);
  227         vm_page_aflag_set(marker, PGA_ENQUEUED);
  228 
  229         vm_batchqueue_init(&ss->bq);
  230         ss->pq = pq;
  231         ss->marker = marker;
  232         ss->maxscan = maxscan;
  233         ss->scanned = 0;
  234         vm_pagequeue_unlock(pq);
  235 }
  236 
  237 static void
  238 vm_pageout_end_scan(struct scan_state *ss)
  239 {
  240         struct vm_pagequeue *pq;
  241 
  242         pq = ss->pq;
  243         vm_pagequeue_assert_locked(pq);
  244         KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0,
  245             ("marker %p not enqueued", ss->marker));
  246 
  247         TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q);
  248         vm_page_aflag_clear(ss->marker, PGA_ENQUEUED);
  249         pq->pq_pdpages += ss->scanned;
  250 }
  251 
  252 /*
  253  * Add a small number of queued pages to a batch queue for later processing
  254  * without the corresponding queue lock held.  The caller must have enqueued a
  255  * marker page at the desired start point for the scan.  Pages will be
  256  * physically dequeued if the caller so requests.  Otherwise, the returned
  257  * batch may contain marker pages, and it is up to the caller to handle them.
  258  *
  259  * When processing the batch queue, vm_page_queue() must be used to
  260  * determine whether the page has been logically dequeued by another thread.
  261  * Once this check is performed, the page lock guarantees that the page will
  262  * not be disassociated from the queue.
  263  */
  264 static __always_inline void
  265 vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue)
  266 {
  267         struct vm_pagequeue *pq;
  268         vm_page_t m, marker;
  269 
  270         marker = ss->marker;
  271         pq = ss->pq;
  272 
  273         KASSERT((marker->aflags & PGA_ENQUEUED) != 0,
  274             ("marker %p not enqueued", ss->marker));
  275 
  276         vm_pagequeue_lock(pq);
  277         for (m = TAILQ_NEXT(marker, plinks.q); m != NULL &&
  278             ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE;
  279             m = TAILQ_NEXT(m, plinks.q), ss->scanned++) {
  280                 if ((m->flags & PG_MARKER) == 0) {
  281                         KASSERT((m->aflags & PGA_ENQUEUED) != 0,
  282                             ("page %p not enqueued", m));
  283                         KASSERT((m->flags & PG_FICTITIOUS) == 0,
  284                             ("Fictitious page %p cannot be in page queue", m));
  285                         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  286                             ("Unmanaged page %p cannot be in page queue", m));
  287                 } else if (dequeue)
  288                         continue;
  289 
  290                 (void)vm_batchqueue_insert(&ss->bq, m);
  291                 if (dequeue) {
  292                         TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
  293                         vm_page_aflag_clear(m, PGA_ENQUEUED);
  294                 }
  295         }
  296         TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);
  297         if (__predict_true(m != NULL))
  298                 TAILQ_INSERT_BEFORE(m, marker, plinks.q);
  299         else
  300                 TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q);
  301         if (dequeue)
  302                 vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt);
  303         vm_pagequeue_unlock(pq);
  304 }
  305 
  306 /* Return the next page to be scanned, or NULL if the scan is complete. */
  307 static __always_inline vm_page_t
  308 vm_pageout_next(struct scan_state *ss, const bool dequeue)
  309 {
  310 
  311         if (ss->bq.bq_cnt == 0)
  312                 vm_pageout_collect_batch(ss, dequeue);
  313         return (vm_batchqueue_pop(&ss->bq));
  314 }
  315 
  316 /*
  317  * Scan for pages at adjacent offsets within the given page's object that are
  318  * eligible for laundering, form a cluster of these pages and the given page,
  319  * and launder that cluster.
  320  */
  321 static int
  322 vm_pageout_cluster(vm_page_t m)
  323 {
  324         vm_object_t object;
  325         vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps;
  326         vm_pindex_t pindex;
  327         int ib, is, page_base, pageout_count;
  328 
  329         vm_page_assert_locked(m);
  330         object = m->object;
  331         VM_OBJECT_ASSERT_WLOCKED(object);
  332         pindex = m->pindex;
  333 
  334         vm_page_assert_unbusied(m);
  335         KASSERT(!vm_page_held(m), ("page %p is held", m));
  336 
  337         pmap_remove_write(m);
  338         vm_page_unlock(m);
  339 
  340         mc[vm_pageout_page_count] = pb = ps = m;
  341         pageout_count = 1;
  342         page_base = vm_pageout_page_count;
  343         ib = 1;
  344         is = 1;
  345 
  346         /*
  347  * We can cluster only if the page is dirty, not busy, not held, and
  348  * in the laundry queue.
  349          *
  350          * During heavy mmap/modification loads the pageout
  351          * daemon can really fragment the underlying file
  352          * due to flushing pages out of order and not trying to
  353          * align the clusters (which leaves sporadic out-of-order
  354          * holes).  To solve this problem we do the reverse scan
  355          * first and attempt to align our cluster, then do a 
  356          * forward scan if room remains.
  357          */
  358 more:
  359         while (ib != 0 && pageout_count < vm_pageout_page_count) {
  360                 if (ib > pindex) {
  361                         ib = 0;
  362                         break;
  363                 }
  364                 if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
  365                         ib = 0;
  366                         break;
  367                 }
  368                 vm_page_test_dirty(p);
  369                 if (p->dirty == 0) {
  370                         ib = 0;
  371                         break;
  372                 }
  373                 vm_page_lock(p);
  374                 if (vm_page_held(p) || !vm_page_in_laundry(p)) {
  375                         vm_page_unlock(p);
  376                         ib = 0;
  377                         break;
  378                 }
  379                 pmap_remove_write(p);
  380                 vm_page_unlock(p);
  381                 mc[--page_base] = pb = p;
  382                 ++pageout_count;
  383                 ++ib;
  384 
  385                 /*
  386                  * We are at an alignment boundary.  Stop here, and switch
  387                  * directions.  Do not clear ib.
  388                  */
  389                 if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
  390                         break;
  391         }
  392         while (pageout_count < vm_pageout_page_count && 
  393             pindex + is < object->size) {
  394                 if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
  395                         break;
  396                 vm_page_test_dirty(p);
  397                 if (p->dirty == 0)
  398                         break;
  399                 vm_page_lock(p);
  400                 if (vm_page_held(p) || !vm_page_in_laundry(p)) {
  401                         vm_page_unlock(p);
  402                         break;
  403                 }
  404                 pmap_remove_write(p);
  405                 vm_page_unlock(p);
  406                 mc[page_base + pageout_count] = ps = p;
  407                 ++pageout_count;
  408                 ++is;
  409         }
  410 
  411         /*
  412          * If we exhausted our forward scan, continue with the reverse scan
  413          * when possible, even past an alignment boundary.  This catches
  414          * boundary conditions.
  415          */
  416         if (ib != 0 && pageout_count < vm_pageout_page_count)
  417                 goto more;
  418 
  419         return (vm_pageout_flush(&mc[page_base], pageout_count,
  420             VM_PAGER_PUT_NOREUSE, 0, NULL, NULL));
  421 }
  422 
  423 /*
  424  * vm_pageout_flush() - launder the given pages
  425  *
  426  *      The given pages are laundered.  Note that we set up for the start of
  427  *      I/O (i.e., busy the page), mark it read-only, and bump the object
  428  *      reference count all in here rather than in the parent.  If we want
  429  *      the parent to do more sophisticated things we may have to change
  430  *      the ordering.
  431  *
  432  *      The returned runlen is the count of pages between mreq and the first
  433  *      page after mreq with status VM_PAGER_AGAIN.
  434  *      *eio is set to TRUE if the pager returned VM_PAGER_ERROR or VM_PAGER_FAIL
  435  *      for any page in the runlen set.
  436  */
  437 int
  438 vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
  439     boolean_t *eio)
  440 {
  441         vm_object_t object = mc[0]->object;
  442         int pageout_status[count];
  443         int numpagedout = 0;
  444         int i, runlen;
  445 
  446         VM_OBJECT_ASSERT_WLOCKED(object);
  447 
  448         /*
  449          * Initiate I/O.  Mark the pages busy and verify that they're valid
  450          * and read-only.
  451          *
  452  *      We do not have to fix up the clean/dirty bits here... we can
  453          * allow the pager to do it after the I/O completes.
  454          *
  455          * NOTE! mc[i]->dirty may be partial or fragmented due to an
  456          * edge case with file fragments.
  457          */
  458         for (i = 0; i < count; i++) {
  459                 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
  460                     ("vm_pageout_flush: partially invalid page %p index %d/%d",
  461                         mc[i], i, count));
  462                 KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0,
  463                     ("vm_pageout_flush: writeable page %p", mc[i]));
  464                 vm_page_sbusy(mc[i]);
  465         }
  466         vm_object_pip_add(object, count);
  467 
  468         vm_pager_put_pages(object, mc, count, flags, pageout_status);
  469 
  470         runlen = count - mreq;
  471         if (eio != NULL)
  472                 *eio = FALSE;
  473         for (i = 0; i < count; i++) {
  474                 vm_page_t mt = mc[i];
  475 
  476                 KASSERT(pageout_status[i] == VM_PAGER_PEND ||
  477                     !pmap_page_is_write_mapped(mt),
  478                     ("vm_pageout_flush: page %p is not write protected", mt));
  479                 switch (pageout_status[i]) {
  480                 case VM_PAGER_OK:
  481                         vm_page_lock(mt);
  482                         if (vm_page_in_laundry(mt))
  483                                 vm_page_deactivate_noreuse(mt);
  484                         vm_page_unlock(mt);
  485                         /* FALLTHROUGH */
  486                 case VM_PAGER_PEND:
  487                         numpagedout++;
  488                         break;
  489                 case VM_PAGER_BAD:
  490                         /*
  491                          * The page is outside the object's range.  We pretend
  492                          * that the page out worked and clean the page, so the
  493                          * changes will be lost if the page is reclaimed by
  494                          * the page daemon.
  495                          */
  496                         vm_page_undirty(mt);
  497                         vm_page_lock(mt);
  498                         if (vm_page_in_laundry(mt))
  499                                 vm_page_deactivate_noreuse(mt);
  500                         vm_page_unlock(mt);
  501                         break;
  502                 case VM_PAGER_ERROR:
  503                 case VM_PAGER_FAIL:
  504                         /*
  505                          * If the page couldn't be paged out to swap because the
  506                          * pager wasn't able to find space, place the page in
  507                          * the PQ_UNSWAPPABLE holding queue.  This is an
  508                          * optimization that prevents the page daemon from
  509                          * wasting CPU cycles on pages that cannot be reclaimed
  510  * because no swap device is configured.
  511                          *
  512                          * Otherwise, reactivate the page so that it doesn't
  513                          * clog the laundry and inactive queues.  (We will try
  514                          * paging it out again later.)
  515                          */
  516                         vm_page_lock(mt);
  517                         if (object->type == OBJT_SWAP &&
  518                             pageout_status[i] == VM_PAGER_FAIL) {
  519                                 vm_page_unswappable(mt);
  520                                 numpagedout++;
  521                         } else
  522                                 vm_page_activate(mt);
  523                         vm_page_unlock(mt);
  524                         if (eio != NULL && i >= mreq && i - mreq < runlen)
  525                                 *eio = TRUE;
  526                         break;
  527                 case VM_PAGER_AGAIN:
  528                         if (i >= mreq && i - mreq < runlen)
  529                                 runlen = i - mreq;
  530                         break;
  531                 }
  532 
  533                 /*
  534                  * If the operation is still going, leave the page busy to
  535                  * block all other accesses. Also, leave the paging in
  536                  * progress indicator set so that we don't attempt an object
  537                  * collapse.
  538                  */
  539                 if (pageout_status[i] != VM_PAGER_PEND) {
  540                         vm_object_pip_wakeup(object);
  541                         vm_page_sunbusy(mt);
  542                 }
  543         }
  544         if (prunlen != NULL)
  545                 *prunlen = runlen;
  546         return (numpagedout);
  547 }
  548 
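      /*
       * Swap device event handlers.  swapdev_enabled tracks whether at
       * least one swap device is configured, letting the laundry thread
       * skip the PQ_UNSWAPPABLE queue when nothing can be paged out to
       * swap anyway.
       */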
  549 static void
  550 vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused)
  551 {
  552 
  553         atomic_store_rel_int(&swapdev_enabled, 1);
  554 }
  555 
  556 static void
  557 vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused)
  558 {
  559 
  560         if (swap_pager_nswapdev() == 1)
  561                 atomic_store_rel_int(&swapdev_enabled, 0);
  562 }
  563 
  564 /*
  565  * Attempt to acquire all of the necessary locks to launder a page and
  566  * then call through the clustering layer to PUTPAGES.  Wait a short
  567  * time for a vnode lock.
  568  *
  569  * Requires the page and object lock on entry, releases both before return.
  570  * Returns 0 on success and an errno otherwise.
  571  */
  572 static int
  573 vm_pageout_clean(vm_page_t m, int *numpagedout)
  574 {
  575         struct vnode *vp;
  576         struct mount *mp;
  577         vm_object_t object;
  578         vm_pindex_t pindex;
  579         int error, lockmode;
  580 
  581         vm_page_assert_locked(m);
  582         object = m->object;
  583         VM_OBJECT_ASSERT_WLOCKED(object);
  584         error = 0;
  585         vp = NULL;
  586         mp = NULL;
  587 
  588         /*
  589          * The object is already known NOT to be dead.   It
  590          * is possible for the vget() to block the whole
  591          * pageout daemon, but the new low-memory handling
  592          * code should prevent it.
  593          *
  594  * We can't wait forever for the vnode lock; we might
  595          * deadlock due to a vn_read() getting stuck in
  596          * vm_wait while holding this vnode.  We skip the 
  597          * vnode if we can't get it in a reasonable amount
  598          * of time.
  599          */
  600         if (object->type == OBJT_VNODE) {
  601                 vm_page_unlock(m);
  602                 vp = object->handle;
  603                 if (vp->v_type == VREG &&
  604                     vn_start_write(vp, &mp, V_NOWAIT) != 0) {
  605                         mp = NULL;
  606                         error = EDEADLK;
  607                         goto unlock_all;
  608                 }
  609                 KASSERT(mp != NULL,
  610                     ("vp %p with NULL v_mount", vp));
  611                 vm_object_reference_locked(object);
  612                 pindex = m->pindex;
  613                 VM_OBJECT_WUNLOCK(object);
  614                 lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
  615                     LK_SHARED : LK_EXCLUSIVE;
  616                 if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
  617                         vp = NULL;
  618                         error = EDEADLK;
  619                         goto unlock_mp;
  620                 }
  621                 VM_OBJECT_WLOCK(object);
  622 
  623                 /*
  624                  * Ensure that the object and vnode were not disassociated
  625                  * while locks were dropped.
  626                  */
  627                 if (vp->v_object != object) {
  628                         error = ENOENT;
  629                         goto unlock_all;
  630                 }
  631                 vm_page_lock(m);
  632 
  633                 /*
  634                  * While the object and page were unlocked, the page
  635                  * may have been:
  636                  * (1) moved to a different queue,
  637                  * (2) reallocated to a different object,
  638                  * (3) reallocated to a different offset, or
  639                  * (4) cleaned.
  640                  */
  641                 if (!vm_page_in_laundry(m) || m->object != object ||
  642                     m->pindex != pindex || m->dirty == 0) {
  643                         vm_page_unlock(m);
  644                         error = ENXIO;
  645                         goto unlock_all;
  646                 }
  647 
  648                 /*
  649                  * The page may have been busied or referenced while the object
  650                  * and page locks were released.
  651                  */
  652                 if (vm_page_busied(m) || vm_page_held(m)) {
  653                         vm_page_unlock(m);
  654                         error = EBUSY;
  655                         goto unlock_all;
  656                 }
  657         }
  658 
  659         /*
  660          * If a page is dirty, then it is either being washed
  661          * (but not yet cleaned) or it is still in the
  662          * laundry.  If it is still in the laundry, then we
  663          * start the cleaning operation. 
  664          */
  665         if ((*numpagedout = vm_pageout_cluster(m)) == 0)
  666                 error = EIO;
  667 
  668 unlock_all:
  669         VM_OBJECT_WUNLOCK(object);
  670 
  671 unlock_mp:
  672         vm_page_lock_assert(m, MA_NOTOWNED);
  673         if (mp != NULL) {
  674                 if (vp != NULL)
  675                         vput(vp);
  676                 vm_object_deallocate(object);
  677                 vn_finished_write(mp);
  678         }
  679 
  680         return (error);
  681 }
  682 
  683 /*
  684  * Attempt to launder the specified number of pages.
  685  *
  686  * Returns the number of pages successfully laundered.
  687  */
  688 static int
  689 vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
  690 {
  691         struct scan_state ss;
  692         struct vm_pagequeue *pq;
  693         struct mtx *mtx;
  694         vm_object_t object;
  695         vm_page_t m, marker;
  696         int act_delta, error, numpagedout, queue, starting_target;
  697         int vnodes_skipped;
  698         bool pageout_ok;
  699 
  700         mtx = NULL;
  701         object = NULL;
  702         starting_target = launder;
  703         vnodes_skipped = 0;
  704 
  705         /*
  706          * Scan the laundry queues for pages eligible to be laundered.  We stop
  707          * once the target number of dirty pages have been laundered, or once
  708          * we've reached the end of the queue.  A single iteration of this loop
  709          * may cause more than one page to be laundered because of clustering.
  710          *
  711          * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no
  712          * swap devices are configured.
  713          */
  714         if (atomic_load_acq_int(&swapdev_enabled))
  715                 queue = PQ_UNSWAPPABLE;
  716         else
  717                 queue = PQ_LAUNDRY;
  718 
  719 scan:
  720         marker = &vmd->vmd_markers[queue];
  721         pq = &vmd->vmd_pagequeues[queue];
  722         vm_pagequeue_lock(pq);
  723         vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
  724         while (launder > 0 && (m = vm_pageout_next(&ss, false)) != NULL) {
  725                 if (__predict_false((m->flags & PG_MARKER) != 0))
  726                         continue;
  727 
  728                 vm_page_change_lock(m, &mtx);
  729 
  730 recheck:
  731                 /*
  732                  * The page may have been disassociated from the queue
  733                  * while locks were dropped.
  734                  */
  735                 if (vm_page_queue(m) != queue)
  736                         continue;
  737 
  738                 /*
  739                  * A requeue was requested, so this page gets a second
  740                  * chance.
  741                  */
  742                 if ((m->aflags & PGA_REQUEUE) != 0) {
  743                         vm_page_requeue(m);
  744                         continue;
  745                 }
  746 
  747                 /*
  748                  * Held pages are essentially stuck in the queue.
  749                  *
  750                  * Wired pages may not be freed.  Complete their removal
  751                  * from the queue now to avoid needless revisits during
  752                  * future scans.
  753                  */
  754                 if (m->hold_count != 0)
  755                         continue;
  756                 if (m->wire_count != 0) {
  757                         vm_page_dequeue_deferred(m);
  758                         continue;
  759                 }
  760 
  761                 if (object != m->object) {
  762                         if (object != NULL)
  763                                 VM_OBJECT_WUNLOCK(object);
  764                         object = m->object;
  765                         if (!VM_OBJECT_TRYWLOCK(object)) {
  766                                 mtx_unlock(mtx);
  767                                 /* Depends on type-stability. */
  768                                 VM_OBJECT_WLOCK(object);
  769                                 mtx_lock(mtx);
  770                                 goto recheck;
  771                         }
  772                 }
  773 
  774                 if (vm_page_busied(m))
  775                         continue;
  776 
  777                 /*
  778                  * Invalid pages can be easily freed.  They cannot be
  779                  * mapped; vm_page_free() asserts this.
  780                  */
  781                 if (m->valid == 0)
  782                         goto free_page;
  783 
  784                 /*
  785                  * If the page has been referenced and the object is not dead,
  786                  * reactivate or requeue the page depending on whether the
  787                  * object is mapped.
  788                  *
  789                  * Test PGA_REFERENCED after calling pmap_ts_referenced() so
  790                  * that a reference from a concurrently destroyed mapping is
  791                  * observed here and now.
  792                  */
  793                 if (object->ref_count != 0)
  794                         act_delta = pmap_ts_referenced(m);
  795                 else {
  796                         KASSERT(!pmap_page_is_mapped(m),
  797                             ("page %p is mapped", m));
  798                         act_delta = 0;
  799                 }
  800                 if ((m->aflags & PGA_REFERENCED) != 0) {
  801                         vm_page_aflag_clear(m, PGA_REFERENCED);
  802                         act_delta++;
  803                 }
  804                 if (act_delta != 0) {
  805                         if (object->ref_count != 0) {
  806                                 VM_CNT_INC(v_reactivated);
  807                                 vm_page_activate(m);
  808 
  809                                 /*
  810                                  * Increase the activation count if the page
  811                                  * was referenced while in the laundry queue.
  812                                  * This makes it less likely that the page will
  813                                  * be returned prematurely to the inactive
  814                                  * queue.
  815                                  */
  816                                 m->act_count += act_delta + ACT_ADVANCE;
  817 
  818                                 /*
  819                                  * If this was a background laundering, count
  820                                  * activated pages towards our target.  The
  821                                  * purpose of background laundering is to ensure
  822                                  * that pages are eventually cycled through the
  823                                  * laundry queue, and an activation is a valid
  824                                  * way out.
  825                                  */
  826                                 if (!in_shortfall)
  827                                         launder--;
  828                                 continue;
  829                         } else if ((object->flags & OBJ_DEAD) == 0) {
  830                                 vm_page_requeue(m);
  831                                 continue;
  832                         }
  833                 }
  834 
  835                 /*
  836                  * If the page appears to be clean at the machine-independent
  837                  * layer, then remove all of its mappings from the pmap in
  838                  * anticipation of freeing it.  If, however, any of the page's
  839                  * mappings allow write access, then the page may still be
  840                  * modified until the last of those mappings are removed.
  841                  */
  842                 if (object->ref_count != 0) {
  843                         vm_page_test_dirty(m);
  844                         if (m->dirty == 0)
  845                                 pmap_remove_all(m);
  846                 }
  847 
  848                 /*
  849                  * Clean pages are freed, and dirty pages are paged out unless
  850                  * they belong to a dead object.  Requeueing dirty pages from
  851                  * dead objects is pointless, as they are being paged out and
  852                  * freed by the thread that destroyed the object.
  853                  */
  854                 if (m->dirty == 0) {
  855 free_page:
  856                         vm_page_free(m);
  857                         VM_CNT_INC(v_dfree);
  858                 } else if ((object->flags & OBJ_DEAD) == 0) {
  859                         if (object->type != OBJT_SWAP &&
  860                             object->type != OBJT_DEFAULT)
  861                                 pageout_ok = true;
  862                         else if (disable_swap_pageouts)
  863                                 pageout_ok = false;
  864                         else
  865                                 pageout_ok = true;
  866                         if (!pageout_ok) {
  867                                 vm_page_requeue(m);
  868                                 continue;
  869                         }
  870 
  871                         /*
  872                          * Form a cluster with adjacent, dirty pages from the
  873                          * same object, and page out that entire cluster.
  874                          *
  875                          * The adjacent, dirty pages must also be in the
  876                          * laundry.  However, their mappings are not checked
  877                          * for new references.  Consequently, a recently
  878                          * referenced page may be paged out.  However, that
  879                          * page will not be prematurely reclaimed.  After page
  880                          * out, the page will be placed in the inactive queue,
  881                          * where any new references will be detected and the
  882                          * page reactivated.
  883                          */
  884                         error = vm_pageout_clean(m, &numpagedout);
  885                         if (error == 0) {
  886                                 launder -= numpagedout;
  887                                 ss.scanned += numpagedout;
  888                         } else if (error == EDEADLK) {
  889                                 pageout_lock_miss++;
  890                                 vnodes_skipped++;
  891                         }
  892                         mtx = NULL;
  893                         object = NULL;
  894                 }
  895         }
  896         if (mtx != NULL) {
  897                 mtx_unlock(mtx);
  898                 mtx = NULL;
  899         }
  900         if (object != NULL) {
  901                 VM_OBJECT_WUNLOCK(object);
  902                 object = NULL;
  903         }
  904         vm_pagequeue_lock(pq);
  905         vm_pageout_end_scan(&ss);
  906         vm_pagequeue_unlock(pq);
  907 
  908         if (launder > 0 && queue == PQ_UNSWAPPABLE) {
  909                 queue = PQ_LAUNDRY;
  910                 goto scan;
  911         }
  912 
  913         /*
  914  * Wake up the sync daemon if we skipped a vnode in a writeable object
  915          * and we didn't launder enough pages.
  916          */
  917         if (vnodes_skipped > 0 && launder > 0)
  918                 (void)speedup_syncer();
  919 
  920         return (starting_target - launder);
  921 }
  922 
  923 /*
  924  * Compute the integer square root.
  925  */
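      /*
       * The loop below implements the standard bit-by-bit (digit-by-digit)
       * method: each iteration halves the partial root and decides whether
       * the current bit belongs in the result.  For example, isqrt(27)
       * accepts 16 at bit 16 (remainder 11), rejects 20 at bit 4, accepts
       * 9 at bit 1, and returns 4 + 1 = 5.
       */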
  926 static u_int
  927 isqrt(u_int num)
  928 {
  929         u_int bit, root, tmp;
  930 
  931         bit = 1u << ((NBBY * sizeof(u_int)) - 2);
  932         while (bit > num)
  933                 bit >>= 2;
  934         root = 0;
  935         while (bit != 0) {
  936                 tmp = root + bit;
  937                 root >>= 1;
  938                 if (num >= tmp) {
  939                         num -= tmp;
  940                         root += bit;
  941                 }
  942                 bit >>= 2;
  943         }
  944         return (root);
  945 }
  946 
  947 /*
  948  * Perform the work of the laundry thread: periodically wake up and determine
  949  * whether any pages need to be laundered.  If so, determine the number of pages
  950  * that need to be laundered, and launder them.
  951  */
  952 static void
  953 vm_pageout_laundry_worker(void *arg)
  954 {
  955         struct vm_domain *vmd;
  956         struct vm_pagequeue *pq;
  957         uint64_t nclean, ndirty, nfreed;
  958         int domain, last_target, launder, shortfall, shortfall_cycle, target;
  959         bool in_shortfall;
  960 
  961         domain = (uintptr_t)arg;
  962         vmd = VM_DOMAIN(domain);
  963         pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
  964         KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
  965 
  966         shortfall = 0;
  967         in_shortfall = false;
  968         shortfall_cycle = 0;
  969         last_target = target = 0;
  970         nfreed = 0;
  971 
  972         /*
  973          * Calls to these handlers are serialized by the swap syscall lock.
  974          */
  975         (void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, vmd,
  976             EVENTHANDLER_PRI_ANY);
  977         (void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, vmd,
  978             EVENTHANDLER_PRI_ANY);
  979 
  980         /*
  981          * The pageout laundry worker is never done, so loop forever.
  982          */
  983         for (;;) {
  984                 KASSERT(target >= 0, ("negative target %d", target));
  985                 KASSERT(shortfall_cycle >= 0,
  986                     ("negative cycle %d", shortfall_cycle));
  987                 launder = 0;
  988 
  989                 /*
  990                  * First determine whether we need to launder pages to meet a
  991                  * shortage of free pages.
  992                  */
  993                 if (shortfall > 0) {
  994                         in_shortfall = true;
  995                         shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
  996                         target = shortfall;
  997                 } else if (!in_shortfall)
  998                         goto trybackground;
  999                 else if (shortfall_cycle == 0 || vm_laundry_target(vmd) <= 0) {
 1000                         /*
 1001                          * We recently entered shortfall and began laundering
 1002                          * pages.  If we have completed that laundering run
 1003                          * (and we are no longer in shortfall) or we have met
 1004                          * our laundry target through other activity, then we
 1005                          * can stop laundering pages.
 1006                          */
 1007                         in_shortfall = false;
 1008                         target = 0;
 1009                         goto trybackground;
 1010                 }
 1011                 launder = target / shortfall_cycle--;
 1012                 goto dolaundry;
 1013                 /*
 1014                 /*
 1015                  * There's no immediate need to launder any pages; see if we
 1016                  * meet the conditions to perform background laundering:
 1017                  *
 1018                  * 1. The ratio of dirty to clean inactive pages exceeds the
 1019                  *    background laundering threshold, or
 1020                  * 2. we haven't yet reached the target of the current
 1021                  *    background laundering run.
 1022                  *
 1023                  * The background laundering threshold is not a constant.
 1024                  * Instead, it is a slowly growing function of the number of
 1025                  * clean pages freed by the page daemon since the last
 1026                  * background laundering.  Thus, as the ratio of dirty to
 1027                  * clean inactive pages grows, the amount of memory pressure
 1028                  * required to trigger laundering decreases.  We ensure
 1029                  * that the threshold is non-zero after an inactive queue
 1030                  * scan, even if that scan failed to free a single clean page.
 1031                  */
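                      /*
                       * For example, with vmd_free_target - vmd_free_min equal
                       * to 1,000 pages and no clean pages freed since the last
                       * run, the multiplier below is isqrt(howmany(1, 1000)) = 1,
                       * so background laundering begins once the laundry queue is
                       * as large as the count of free plus inactive pages.  Once
                       * roughly 3,000 clean pages have been freed, the multiplier
                       * grows to isqrt(4) = 2 and laundering begins when the
                       * laundry queue is only half that size.
                       */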
 1032 trybackground:
 1033                 nclean = vmd->vmd_free_count +
 1034                     vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt;
 1035                 ndirty = vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt;
 1036                 if (target == 0 && ndirty * isqrt(howmany(nfreed + 1,
 1037                     vmd->vmd_free_target - vmd->vmd_free_min)) >= nclean) {
 1038                         target = vmd->vmd_background_launder_target;
 1039                 }
 1040 
 1041                 /*
 1042                  * We have a non-zero background laundering target.  If we've
 1043                  * laundered up to our maximum without observing a page daemon
 1044                  * request, just stop.  This is a safety belt that ensures we
 1045                  * don't launder an excessive amount if memory pressure is low
 1046                  * and the ratio of dirty to clean pages is large.  Otherwise,
 1047                  * proceed at the background laundering rate.
 1048                  */
 1049                 if (target > 0) {
 1050                         if (nfreed > 0) {
 1051                                 nfreed = 0;
 1052                                 last_target = target;
 1053                         } else if (last_target - target >=
 1054                             vm_background_launder_max * PAGE_SIZE / 1024) {
 1055                                 target = 0;
 1056                         }
 1057                         launder = vm_background_launder_rate * PAGE_SIZE / 1024;
 1058                         launder /= VM_LAUNDER_RATE;
 1059                         if (launder > target)
 1060                                 launder = target;
 1061                 }
 1062 
 1063 dolaundry:
 1064                 if (launder > 0) {
 1065                         /*
 1066                          * Because of I/O clustering, the number of laundered
 1067                          * pages could exceed "target" by the maximum size of
 1068                          * a cluster minus one. 
 1069                          */
 1070                         target -= min(vm_pageout_launder(vmd, launder,
 1071                             in_shortfall), target);
 1072                         pause("laundp", hz / VM_LAUNDER_RATE);
 1073                 }
 1074 
 1075                 /*
 1076                  * If we're not currently laundering pages and the page daemon
 1077                  * hasn't posted a new request, sleep until the page daemon
 1078                  * kicks us.
 1079                  */
 1080                 vm_pagequeue_lock(pq);
 1081                 if (target == 0 && vmd->vmd_laundry_request == VM_LAUNDRY_IDLE)
 1082                         (void)mtx_sleep(&vmd->vmd_laundry_request,
 1083                             vm_pagequeue_lockptr(pq), PVM, "launds", 0);
 1084 
 1085                 /*
 1086                  * If the pagedaemon has indicated that it's in shortfall, start
 1087                  * a shortfall laundering unless we're already in the middle of
 1088                  * one.  This may preempt a background laundering.
 1089                  */
 1090                 if (vmd->vmd_laundry_request == VM_LAUNDRY_SHORTFALL &&
 1091                     (!in_shortfall || shortfall_cycle == 0)) {
 1092                         shortfall = vm_laundry_target(vmd) +
 1093                             vmd->vmd_pageout_deficit;
 1094                         target = 0;
 1095                 } else
 1096                         shortfall = 0;
 1097 
 1098                 if (target == 0)
 1099                         vmd->vmd_laundry_request = VM_LAUNDRY_IDLE;
 1100                 nfreed += vmd->vmd_clean_pages_freed;
 1101                 vmd->vmd_clean_pages_freed = 0;
 1102                 vm_pagequeue_unlock(pq);
 1103         }
 1104 }
 1105 
 1106 /*
 1107  * Compute the number of pages we want to try to move from the
 1108  * active queue to either the inactive or laundry queue.
 1109  *
 1110  * When scanning active pages during a shortage, we make clean pages
 1111  * count more heavily towards the page shortage than dirty pages.
 1112  * This is because dirty pages must be laundered before they can be
 1113  * reused and thus have less utility when attempting to quickly
 1114  * alleviate a free page shortage.  However, this weighting also
 1115  * causes the scan to deactivate dirty pages more aggressively,
 1116  * improving the effectiveness of clustering.
 1117  */
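      /*
       * For example, with act_scan_laundry_weight = 3, the shortage computed
       * below is expressed in weighted units: each clean page that the active
       * scan deactivates subtracts 3 from the shortage, while each dirty page
       * moved to the laundry subtracts only 1, so meeting a given shortage
       * with dirty pages requires roughly three times as many of them.
       */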
 1118 static int
 1119 vm_pageout_active_target(struct vm_domain *vmd)
 1120 {
 1121         int shortage;
 1122 
 1123         shortage = vmd->vmd_inactive_target + vm_paging_target(vmd) -
 1124             (vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt +
 1125             vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight);
 1126         shortage *= act_scan_laundry_weight;
 1127         return (shortage);
 1128 }
 1129 
 1130 /*
 1131  * Scan the active queue.  If there is no shortage of inactive pages, scan a
 1132  * small portion of the queue in order to maintain quasi-LRU.
 1133  */
 1134 static void
 1135 vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage)
 1136 {
 1137         struct scan_state ss;
 1138         struct mtx *mtx;
 1139         vm_page_t m, marker;
 1140         struct vm_pagequeue *pq;
 1141         long min_scan;
 1142         int act_delta, max_scan, scan_tick;
 1143 
 1144         marker = &vmd->vmd_markers[PQ_ACTIVE];
 1145         pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
 1146         vm_pagequeue_lock(pq);
 1147 
 1148         /*
 1149          * If we're just idle polling, attempt to visit every
 1150          * active page within 'update_period' seconds.
 1151          */
 1152         scan_tick = ticks;
 1153         if (vm_pageout_update_period != 0) {
 1154                 min_scan = pq->pq_cnt;
 1155                 min_scan *= scan_tick - vmd->vmd_last_active_scan;
 1156                 min_scan /= hz * vm_pageout_update_period;
 1157         } else
 1158                 min_scan = 0;
 1159         if (min_scan > 0 || (page_shortage > 0 && pq->pq_cnt > 0))
 1160                 vmd->vmd_last_active_scan = scan_tick;
 1161 
 1162         /*
 1163          * Scan the active queue for pages that can be deactivated.  Update
 1164          * the per-page activity counter and use it to identify deactivation
 1165          * candidates.  Held pages may be deactivated.
 1166          *
 1167          * To avoid requeuing each page that remains in the active queue, we
 1168          * implement the CLOCK algorithm.  To keep the implementation of the
 1169          * enqueue operation consistent for all page queues, we use two hands,
 1170          * represented by marker pages. Scans begin at the first hand, which
 1171          * precedes the second hand in the queue.  When the two hands meet,
 1172          * they are moved back to the head and tail of the queue, respectively,
 1173          * and scanning resumes.
 1174          */
 1175         max_scan = page_shortage > 0 ? pq->pq_cnt : min_scan;
 1176         mtx = NULL;
 1177 act_scan:
 1178         vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan);
 1179         while ((m = vm_pageout_next(&ss, false)) != NULL) {
 1180                 if (__predict_false(m == &vmd->vmd_clock[1])) {
 1181                         vm_pagequeue_lock(pq);
 1182                         TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
 1183                         TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q);
 1184                         TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0],
 1185                             plinks.q);
 1186                         TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1],
 1187                             plinks.q);
 1188                         max_scan -= ss.scanned;
 1189                         vm_pageout_end_scan(&ss);
 1190                         goto act_scan;
 1191                 }
 1192                 if (__predict_false((m->flags & PG_MARKER) != 0))
 1193                         continue;
 1194 
 1195                 vm_page_change_lock(m, &mtx);
 1196 
 1197                 /*
 1198                  * The page may have been disassociated from the queue
 1199                  * while locks were dropped.
 1200                  */
 1201                 if (vm_page_queue(m) != PQ_ACTIVE)
 1202                         continue;
 1203 
 1204                 /*
 1205                  * Wired pages are dequeued lazily.
 1206                  */
 1207                 if (m->wire_count != 0) {
 1208                         vm_page_dequeue_deferred(m);
 1209                         continue;
 1210                 }
 1211 
 1212                 /*
 1213                  * Check to see "how much" the page has been used.
 1214                  *
 1215                  * Test PGA_REFERENCED after calling pmap_ts_referenced() so
 1216                  * that a reference from a concurrently destroyed mapping is
 1217                  * observed here and now.
 1218                  *
 1219                  * Perform an unsynchronized object ref count check.  While
 1220                  * the page lock ensures that the page is not reallocated to
 1221                  * another object, in particular, one with unmanaged mappings
 1222                  * that cannot support pmap_ts_referenced(), two races are,
 1223                  * nonetheless, possible:
 1224                  * 1) The count was transitioning to zero, but we saw a non-
 1225                  *    zero value.  pmap_ts_referenced() will return zero
 1226                  *    because the page is not mapped.
 1227                  * 2) The count was transitioning to one, but we saw zero.
 1228                  *    This race delays the detection of a new reference.  At
 1229                  *    worst, we will deactivate and reactivate the page.
 1230                  */
 1231                 if (m->object->ref_count != 0)
 1232                         act_delta = pmap_ts_referenced(m);
 1233                 else
 1234                         act_delta = 0;
 1235                 if ((m->aflags & PGA_REFERENCED) != 0) {
 1236                         vm_page_aflag_clear(m, PGA_REFERENCED);
 1237                         act_delta++;
 1238                 }
 1239 
 1240                 /*
 1241                  * Advance or decay the act_count based on recent usage.
 1242                  */
 1243                 if (act_delta != 0) {
 1244                         m->act_count += ACT_ADVANCE + act_delta;
 1245                         if (m->act_count > ACT_MAX)
 1246                                 m->act_count = ACT_MAX;
 1247                 } else
 1248                         m->act_count -= min(m->act_count, ACT_DECLINE);
 1249 
 1250                 if (m->act_count == 0) {
 1251                         /*
 1252                          * When not short for inactive pages, let dirty pages go
 1253                          * through the inactive queue before moving to the
 1254                          * laundry queues.  This gives them some extra time to
 1255                          * be reactivated, potentially avoiding an expensive
 1256                          * pageout.  However, during a page shortage, the
 1257                          * inactive queue is necessarily small, and so dirty
 1258                          * pages would only spend a trivial amount of time in
 1259                          * the inactive queue.  Therefore, we might as well
 1260                          * place them directly in the laundry queue to reduce
 1261                          * queuing overhead.
 1262                          */
 1263                         if (page_shortage <= 0)
 1264                                 vm_page_deactivate(m);
 1265                         else {
 1266                                 /*
 1267                                  * Calling vm_page_test_dirty() here would
 1268                                  * require acquisition of the object's write
 1269                                  * lock.  However, during a page shortage,
 1270                                  * directing dirty pages into the laundry
 1271                                  * queue is only an optimization and not a
 1272                                  * requirement.  Therefore, we simply rely on
 1273                                  * the opportunistic updates to the page's
 1274                                  * dirty field by the pmap.
 1275                                  */
 1276                                 if (m->dirty == 0) {
 1277                                         vm_page_deactivate(m);
 1278                                         page_shortage -=
 1279                                             act_scan_laundry_weight;
 1280                                 } else {
 1281                                         vm_page_launder(m);
 1282                                         page_shortage--;
 1283                                 }
 1284                         }
 1285                 }
 1286         }
 1287         if (mtx != NULL) {
 1288                 mtx_unlock(mtx);
 1289                 mtx = NULL;
 1290         }
 1291         vm_pagequeue_lock(pq);
 1292         TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
 1293         TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q);
 1294         vm_pageout_end_scan(&ss);
 1295         vm_pagequeue_unlock(pq);
 1296 }
 1297 
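      /*
       * Put a single page back on the inactive queue, honoring any requeue
       * request recorded in its aflags: PGA_REQUEUE_HEAD places the page
       * just before the vmd_inacthead marker, PGA_REQUEUE places it at the
       * tail of the queue, and otherwise the page goes back in front of the
       * scan marker so that it keeps its place in the LRU order.  Returns 1
       * if the page was re-enqueued and 0 if it is no longer eligible (it
       * left PQ_INACTIVE or was already enqueued).
       */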
 1298 static int
 1299 vm_pageout_reinsert_inactive_page(struct scan_state *ss, vm_page_t m)
 1300 {
 1301         struct vm_domain *vmd;
 1302 
 1303         if (m->queue != PQ_INACTIVE || (m->aflags & PGA_ENQUEUED) != 0)
 1304                 return (0);
 1305         vm_page_aflag_set(m, PGA_ENQUEUED);
 1306         if ((m->aflags & PGA_REQUEUE_HEAD) != 0) {
 1307                 vmd = vm_pagequeue_domain(m);
 1308                 TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q);
 1309                 vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD);
 1310         } else if ((m->aflags & PGA_REQUEUE) != 0) {
 1311                 TAILQ_INSERT_TAIL(&ss->pq->pq_pl, m, plinks.q);
 1312                 vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD);
 1313         } else
 1314                 TAILQ_INSERT_BEFORE(ss->marker, m, plinks.q);
 1315         return (1);
 1316 }
 1317 
 1318 /*
 1319  * Re-add stuck pages to the inactive queue.  We will examine them again
 1320  * during the next scan.  If the queue state of a page has changed since
 1321  * it was physically removed from the page queue in
 1322  * vm_pageout_collect_batch(), don't do anything with that page.
 1323  */
 1324 static void
 1325 vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq,
 1326     vm_page_t m)
 1327 {
 1328         struct vm_pagequeue *pq;
 1329         int delta;
 1330 
 1331         delta = 0;
 1332         pq = ss->pq;
 1333 
 1334         if (m != NULL) {
 1335                 if (vm_batchqueue_insert(bq, m))
 1336                         return;
 1337                 vm_pagequeue_lock(pq);
 1338                 delta += vm_pageout_reinsert_inactive_page(ss, m);
 1339         } else
 1340                 vm_pagequeue_lock(pq);
 1341         while ((m = vm_batchqueue_pop(bq)) != NULL)
 1342                 delta += vm_pageout_reinsert_inactive_page(ss, m);
 1343         vm_pagequeue_cnt_add(pq, delta);
 1344         vm_pagequeue_unlock(pq);
 1345         vm_batchqueue_init(bq);
 1346 }
 1347 
 1348 /*
 1349  * Attempt to reclaim the requested number of pages from the inactive queue.
 1350  * Returns true if the shortage was addressed.
 1351  */
 1352 static int
 1353 vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage,
 1354     int *addl_shortage)
 1355 {
 1356         struct scan_state ss;
 1357         struct vm_batchqueue rq;
 1358         struct mtx *mtx;
 1359         vm_page_t m, marker;
 1360         struct vm_pagequeue *pq;
 1361         vm_object_t object;
 1362         int act_delta, addl_page_shortage, deficit, page_shortage;
 1363         int starting_page_shortage;
 1364 
 1365         /*
 1366          * The addl_page_shortage is an estimate of the number of temporarily
 1367          * stuck pages in the inactive queue.  In other words, the
 1368          * number of pages from the inactive count that should be
 1369          * discounted in setting the target for the active queue scan.
 1370          */
 1371         addl_page_shortage = 0;
 1372 
 1373         /*
 1374          * vmd_pageout_deficit counts the number of pages requested in
 1375          * allocations that failed because of a free page shortage.  We assume
 1376          * that the allocations will be reattempted and thus include the deficit
 1377          * in our scan target.
 1378          */
 1379         deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit);
 1380         starting_page_shortage = page_shortage = shortage + deficit;
 1381 
 1382         mtx = NULL;
 1383         object = NULL;
 1384         vm_batchqueue_init(&rq);
 1385 
 1386         /*
 1387          * Start scanning the inactive queue for pages that we can free.  The
 1388          * scan will stop when we reach the target or we have scanned the
 1389          * entire queue.  (Note that m->act_count is not used to make
 1390          * decisions for the inactive queue, only for the active queue.)
 1391          */
 1392         marker = &vmd->vmd_markers[PQ_INACTIVE];
 1393         pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
 1394         vm_pagequeue_lock(pq);
 1395         vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
 1396         while (page_shortage > 0 && (m = vm_pageout_next(&ss, true)) != NULL) {
 1397                 KASSERT((m->flags & PG_MARKER) == 0,
 1398                     ("marker page %p was dequeued", m));
 1399 
 1400                 vm_page_change_lock(m, &mtx);
 1401 
 1402 recheck:
 1403                 /*
 1404                  * The page may have been disassociated from the queue
 1405                  * while locks were dropped.
 1406                  */
 1407                 if (vm_page_queue(m) != PQ_INACTIVE) {
 1408                         addl_page_shortage++;
 1409                         continue;
 1410                 }
 1411 
 1412                 /*
 1413                  * The page was re-enqueued after the page queue lock was
 1414                  * dropped, or a requeue was requested.  This page gets a second
 1415                  * chance.
 1416                  */
 1417                 if ((m->aflags & (PGA_ENQUEUED | PGA_REQUEUE |
 1418                     PGA_REQUEUE_HEAD)) != 0)
 1419                         goto reinsert;
 1420 
 1421                 /*
 1422                  * Held pages are essentially stuck in the queue.  So,
 1423                  * they ought to be discounted from the inactive count.
 1424                  * See the description of addl_page_shortage above.
 1425                  *
 1426                  * Wired pages may not be freed.  Complete their removal
 1427                  * from the queue now to avoid needless revisits during
 1428                  * future scans.
 1429                  */
 1430                 if (m->hold_count != 0) {
 1431                         addl_page_shortage++;
 1432                         goto reinsert;
 1433                 }
 1434                 if (m->wire_count != 0) {
 1435                         vm_page_dequeue_deferred(m);
 1436                         continue;
 1437                 }
 1438 
 1439                 if (object != m->object) {
 1440                         if (object != NULL)
 1441                                 VM_OBJECT_WUNLOCK(object);
 1442                         object = m->object;
 1443                         if (!VM_OBJECT_TRYWLOCK(object)) {
 1444                                 mtx_unlock(mtx);
 1445                                 /* Depends on type-stability. */
 1446                                 VM_OBJECT_WLOCK(object);
 1447                                 mtx_lock(mtx);
 1448                                 goto recheck;
 1449                         }
 1450                 }
 1451 
 1452                 if (vm_page_busied(m)) {
 1453                         /*
 1454                          * Don't mess with busy pages.  Leave them at
 1455                          * the front of the queue.  Most likely, they
 1456                          * are being paged out and will leave the
 1457                          * queue shortly after the scan finishes.  So,
 1458                          * they ought to be discounted from the
 1459                          * inactive count.
 1460                          */
 1461                         addl_page_shortage++;
 1462                         goto reinsert;
 1463                 }
 1464 
 1465                 /*
 1466                  * Invalid pages can be easily freed.  They cannot be
 1467                  * mapped; vm_page_free() asserts this.
 1468                  */
 1469                 if (m->valid == 0)
 1470                         goto free_page;
 1471 
 1472                 /*
 1473                  * If the page has been referenced and the object is not dead,
 1474                  * reactivate or requeue the page depending on whether the
 1475                  * object is mapped.
 1476                  *
 1477                  * Test PGA_REFERENCED after calling pmap_ts_referenced() so
 1478                  * that a reference from a concurrently destroyed mapping is
 1479                  * observed here and now.
 1480                  */
 1481                 if (object->ref_count != 0)
 1482                         act_delta = pmap_ts_referenced(m);
 1483                 else {
 1484                         KASSERT(!pmap_page_is_mapped(m),
 1485                             ("page %p is mapped", m));
 1486                         act_delta = 0;
 1487                 }
 1488                 if ((m->aflags & PGA_REFERENCED) != 0) {
 1489                         vm_page_aflag_clear(m, PGA_REFERENCED);
 1490                         act_delta++;
 1491                 }
 1492                 if (act_delta != 0) {
 1493                         if (object->ref_count != 0) {
 1494                                 VM_CNT_INC(v_reactivated);
 1495                                 vm_page_activate(m);
 1496 
 1497                                 /*
 1498                                  * Increase the activation count if the page
 1499                                  * was referenced while in the inactive queue.
 1500                                  * This makes it less likely that the page will
 1501                                  * be returned prematurely to the inactive
 1502                                  * queue.
 1503                                  */
 1504                                 m->act_count += act_delta + ACT_ADVANCE;
 1505                                 continue;
 1506                         } else if ((object->flags & OBJ_DEAD) == 0) {
 1507                                 vm_page_aflag_set(m, PGA_REQUEUE);
 1508                                 goto reinsert;
 1509                         }
 1510                 }
 1511 
 1512                 /*
 1513                  * If the page appears to be clean at the machine-independent
 1514                  * layer, then remove all of its mappings from the pmap in
 1515                  * anticipation of freeing it.  If, however, any of the page's
 1516                  * mappings allow write access, then the page may still be
 1517                  * modified until the last of those mappings are removed.
 1518                  */
 1519                 if (object->ref_count != 0) {
 1520                         vm_page_test_dirty(m);
 1521                         if (m->dirty == 0)
 1522                                 pmap_remove_all(m);
 1523                 }
 1524 
 1525                 /*
 1526                  * Clean pages can be freed, but dirty pages must be sent back
 1527                  * to the laundry, unless they belong to a dead object.
 1528                  * Requeueing dirty pages from dead objects is pointless, as
 1529                  * they are being paged out and freed by the thread that
 1530                  * destroyed the object.
 1531                  */
 1532                 if (m->dirty == 0) {
 1533 free_page:
 1534                         /*
 1535                          * Because we dequeued the page and have already
 1536                          * checked for concurrent dequeue and enqueue
 1537                          * requests, we can safely disassociate the page
 1538                          * from the inactive queue.
 1539                          */
 1540                         KASSERT((m->aflags & PGA_QUEUE_STATE_MASK) == 0,
 1541                             ("page %p has queue state", m));
 1542                         m->queue = PQ_NONE;
 1543                         vm_page_free(m);
 1544                         page_shortage--;
 1545                 } else if ((object->flags & OBJ_DEAD) == 0)
 1546                         vm_page_launder(m);
 1547                 continue;
 1548 reinsert:
 1549                 vm_pageout_reinsert_inactive(&ss, &rq, m);
 1550         }
 1551         if (mtx != NULL)
 1552                 mtx_unlock(mtx);
 1553         if (object != NULL)
 1554                 VM_OBJECT_WUNLOCK(object);
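              /*
               * Flush any pages remaining in the local requeue batch and in
               * the scan batch back onto the inactive queue before ending
               * the scan.
               */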
 1555         vm_pageout_reinsert_inactive(&ss, &rq, NULL);
 1556         vm_pageout_reinsert_inactive(&ss, &ss.bq, NULL);
 1557         vm_pagequeue_lock(pq);
 1558         vm_pageout_end_scan(&ss);
 1559         vm_pagequeue_unlock(pq);
 1560 
 1561         VM_CNT_ADD(v_dfree, starting_page_shortage - page_shortage);
 1562 
 1563         /*
 1564          * Wake up the laundry thread so that it can perform any needed
 1565          * laundering.  If we didn't meet our target, we're in shortfall and
 1566          * need to launder more aggressively.  If PQ_LAUNDRY is empty and no
 1567          * swap devices are configured, the laundry thread has no work to do, so
 1568          * don't bother waking it up.
 1569          *
 1570          * The laundry thread uses the number of inactive queue scans elapsed
 1571          * since the last laundering to determine whether to launder again, so
 1572          * keep count.
 1573          */
 1574         if (starting_page_shortage > 0) {
 1575                 pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
 1576                 vm_pagequeue_lock(pq);
 1577                 if (vmd->vmd_laundry_request == VM_LAUNDRY_IDLE &&
 1578                     (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) {
 1579                         if (page_shortage > 0) {
 1580                                 vmd->vmd_laundry_request = VM_LAUNDRY_SHORTFALL;
 1581                                 VM_CNT_INC(v_pdshortfalls);
 1582                         } else if (vmd->vmd_laundry_request !=
 1583                             VM_LAUNDRY_SHORTFALL)
 1584                                 vmd->vmd_laundry_request =
 1585                                     VM_LAUNDRY_BACKGROUND;
 1586                         wakeup(&vmd->vmd_laundry_request);
 1587                 }
 1588                 vmd->vmd_clean_pages_freed +=
 1589                     starting_page_shortage - page_shortage;
 1590                 vm_pagequeue_unlock(pq);
 1591         }
 1592 
 1593         /*
 1594          * Wakeup the swapout daemon if we didn't free the targeted number of
 1595          * pages.
 1596          */
 1597         if (page_shortage > 0)
 1598                 vm_swapout_run();
 1599 
 1600         /*
 1601          * If the inactive queue scan fails repeatedly to meet its
 1602          * target, kill the largest process.
 1603          */
 1604         vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);
 1605 
 1606         /*
 1607          * Reclaim pages by swapping out idle processes, if configured to do so.
 1608          */
 1609         vm_swapout_run_idle();
 1610 
 1611         /*
 1612          * See the description of addl_page_shortage above.
 1613          */
 1614         *addl_shortage = addl_page_shortage + deficit;
 1615 
 1616         return (page_shortage <= 0);
 1617 }
 1618 
 1619 static int vm_pageout_oom_vote;
 1620 
 1621 /*
 1622  * The pagedaemon threads randomly select one to perform the
 1623  * OOM kill.  Killing processes before all pagedaemons have
 1624  * failed to reach the free page target would be premature.
 1625  */
 1626 static void
 1627 vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
 1628     int starting_page_shortage)
 1629 {
 1630         int old_vote;
 1631 
 1632         if (starting_page_shortage <= 0 || starting_page_shortage !=
 1633             page_shortage)
 1634                 vmd->vmd_oom_seq = 0;
 1635         else
 1636                 vmd->vmd_oom_seq++;
 1637         if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
 1638                 if (vmd->vmd_oom) {
 1639                         vmd->vmd_oom = FALSE;
 1640                         atomic_subtract_int(&vm_pageout_oom_vote, 1);
 1641                 }
 1642                 return;
 1643         }
 1644 
 1645         /*
 1646          * Do not follow the call sequence until the OOM condition is
 1647          * cleared.
 1648          */
 1649         vmd->vmd_oom_seq = 0;
 1650 
 1651         if (vmd->vmd_oom)
 1652                 return;
 1653 
 1654         vmd->vmd_oom = TRUE;
 1655         old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
 1656         if (old_vote != vm_ndomains - 1)
 1657                 return;
 1658 
 1659         /*
 1660          * The current pagedaemon thread is the last in the quorum to
 1661          * start OOM.  Initiate the selection and signaling of the
 1662          * victim.
 1663          */
 1664         vm_pageout_oom(VM_OOM_MEM);
 1665 
 1666         /*
 1667          * After one round of OOM terror, recall our vote.  On the
 1668          * next pass, the current pagedaemon will vote again if the low
 1669          * memory condition is still present, since vmd_oom is now
 1670          * false.
 1671          */
 1672         vmd->vmd_oom = FALSE;
 1673         atomic_subtract_int(&vm_pageout_oom_vote, 1);
 1674 }
 1675 
 1676 /*
 1677  * The OOM killer is the page daemon's action of last resort when
 1678  * memory allocation requests have been stalled for a prolonged period
 1679  * of time because it cannot reclaim memory.  This function computes
 1680  * the approximate number of physical pages that could be reclaimed if
 1681  * the specified address space is destroyed.
 1682  *
 1683  * Private, anonymous memory owned by the address space is the
 1684  * principal resource that we expect to recover after an OOM kill.
 1685  * Since the physical pages mapped by the address space's COW entries
 1686  * are typically shared pages, they are unlikely to be released and so
 1687  * they are not counted.
 1688  *
 1689  * To get to the point where the page daemon runs the OOM killer, its
 1690  * efforts to write-back vnode-backed pages may have stalled.  This
 1691  * could be caused by a memory allocation deadlock in the write path
 1692  * that might be resolved by an OOM kill.  Therefore, physical pages
 1693  * belonging to vnode-backed objects are counted, because they might
 1694  * be freed without being written out first if the address space holds
 1695  * the last reference to an unlinked vnode.
 1696  *
 1697  * Similarly, physical pages belonging to OBJT_PHYS objects are
 1698  * counted because the address space might hold the last reference to
 1699  * the object.
 1700  */
 1701 static long
 1702 vm_pageout_oom_pagecount(struct vmspace *vmspace)
 1703 {
 1704         vm_map_t map;
 1705         vm_map_entry_t entry;
 1706         vm_object_t obj;
 1707         long res;
 1708 
 1709         map = &vmspace->vm_map;
 1710         KASSERT(!map->system_map, ("system map"));
 1711         sx_assert(&map->lock, SA_LOCKED);
 1712         res = 0;
 1713         for (entry = map->header.next; entry != &map->header;
 1714             entry = entry->next) {
 1715                 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 1716                         continue;
 1717                 obj = entry->object.vm_object;
 1718                 if (obj == NULL)
 1719                         continue;
 1720                 if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 &&
 1721                     obj->ref_count != 1)
 1722                         continue;
 1723                 switch (obj->type) {
 1724                 case OBJT_DEFAULT:
 1725                 case OBJT_SWAP:
 1726                 case OBJT_PHYS:
 1727                 case OBJT_VNODE:
 1728                         res += obj->resident_page_count;
 1729                         break;
 1730                 }
 1731         }
 1732         return (res);
 1733 }
 1734 
 1735 void
 1736 vm_pageout_oom(int shortage)
 1737 {
 1738         struct proc *p, *bigproc;
 1739         vm_offset_t size, bigsize;
 1740         struct thread *td;
 1741         struct vmspace *vm;
 1742         bool breakout;
 1743 
 1744         /*
 1745          * We keep the process bigproc locked once we find it to keep anyone
 1746          * from messing with it; however, there is a possibility of
 1747          * deadlock if process B is bigproc and one of its child processes
 1748          * attempts to propagate a signal to B while we are waiting for
 1749          * the lock of another process in this list.  To avoid this, we
 1750          * don't block on the process lock but just skip locked processes.
 1751          */
 1752         bigproc = NULL;
 1753         bigsize = 0;
 1754         sx_slock(&allproc_lock);
 1755         FOREACH_PROC_IN_SYSTEM(p) {
 1756                 PROC_LOCK(p);
 1757 
 1758                 /*
 1759                  * If this is a system, protected or killed process, or
                       * init, or (while swap space remains) another low-pid
                       * process, skip it.
 1760                  */
 1761                 if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
 1762                     P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
 1763                     p->p_pid == 1 || P_KILLED(p) ||
 1764                     (p->p_pid < 48 && swap_pager_avail != 0)) {
 1765                         PROC_UNLOCK(p);
 1766                         continue;
 1767                 }
 1768                 /*
 1769                  * If the process is in a non-running type state,
 1770                  * don't touch it.  Check all the threads individually.
 1771                  */
 1772                 breakout = false;
 1773                 FOREACH_THREAD_IN_PROC(p, td) {
 1774                         thread_lock(td);
 1775                         if (!TD_ON_RUNQ(td) &&
 1776                             !TD_IS_RUNNING(td) &&
 1777                             !TD_IS_SLEEPING(td) &&
 1778                             !TD_IS_SUSPENDED(td) &&
 1779                             !TD_IS_SWAPPED(td)) {
 1780                                 thread_unlock(td);
 1781                                 breakout = true;
 1782                                 break;
 1783                         }
 1784                         thread_unlock(td);
 1785                 }
 1786                 if (breakout) {
 1787                         PROC_UNLOCK(p);
 1788                         continue;
 1789                 }
 1790                 /*
 1791                  * get the process size
 1792                  */
 1793                 vm = vmspace_acquire_ref(p);
 1794                 if (vm == NULL) {
 1795                         PROC_UNLOCK(p);
 1796                         continue;
 1797                 }
 1798                 _PHOLD_LITE(p);
 1799                 PROC_UNLOCK(p);
 1800                 sx_sunlock(&allproc_lock);
 1801                 if (!vm_map_trylock_read(&vm->vm_map)) {
 1802                         vmspace_free(vm);
 1803                         sx_slock(&allproc_lock);
 1804                         PRELE(p);
 1805                         continue;
 1806                 }
 1807                 size = vmspace_swap_count(vm);
 1808                 if (shortage == VM_OOM_MEM)
 1809                         size += vm_pageout_oom_pagecount(vm);
 1810                 vm_map_unlock_read(&vm->vm_map);
 1811                 vmspace_free(vm);
 1812                 sx_slock(&allproc_lock);
 1813 
 1814                 /*
 1815                  * If this process is bigger than the biggest one,
 1816                  * remember it.
 1817                  */
 1818                 if (size > bigsize) {
 1819                         if (bigproc != NULL)
 1820                                 PRELE(bigproc);
 1821                         bigproc = p;
 1822                         bigsize = size;
 1823                 } else {
 1824                         PRELE(p);
 1825                 }
 1826         }
 1827         sx_sunlock(&allproc_lock);
 1828         if (bigproc != NULL) {
 1829                 if (vm_panic_on_oom != 0)
 1830                         panic("out of swap space");
 1831                 PROC_LOCK(bigproc);
 1832                 killproc(bigproc, "out of swap space");
 1833                 sched_nice(bigproc, PRIO_MIN);
 1834                 _PRELE(bigproc);
 1835                 PROC_UNLOCK(bigproc);
 1836         }
 1837 }
 1838 
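      /*
       * Run a low-memory pass if at least lowmem_period seconds have elapsed
       * since the last one: fire the vm_lowmem eventhandler so consumers can
       * shrink their caches, then drain the UMA caches.  The atomic update of
       * lowmem_ticks ensures that only one pagedaemon thread performs the
       * pass for a given period.  Returns true if a pass was run.
       */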
 1839 static bool
 1840 vm_pageout_lowmem(void)
 1841 {
 1842         static int lowmem_ticks = 0;
 1843         int last;
 1844 
 1845         last = atomic_load_int(&lowmem_ticks);
 1846         while ((u_int)(ticks - last) / hz >= lowmem_period) {
 1847                 if (atomic_fcmpset_int(&lowmem_ticks, &last, ticks) == 0)
 1848                         continue;
 1849 
 1850                 /*
 1851                  * Decrease registered cache sizes.
 1852                  */
 1853                 SDT_PROBE0(vm, , , vm__lowmem_scan);
 1854                 EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES);
 1855 
 1856                 /*
 1857                  * We do this explicitly after the caches have been
 1858                  * drained above.
 1859                  */
 1860                 uma_reclaim();
 1861                 return (true);
 1862         }
 1863         return (false);
 1864 }
 1865 
 1866 static void
 1867 vm_pageout_worker(void *arg)
 1868 {
 1869         struct vm_domain *vmd;
 1870         u_int ofree;
 1871         int addl_shortage, domain, shortage;
 1872         bool target_met;
 1873 
 1874         domain = (uintptr_t)arg;
 1875         vmd = VM_DOMAIN(domain);
 1876         shortage = 0;
 1877         target_met = true;
 1878 
 1879         /*
 1880          * XXXKIB It could be useful to bind pageout daemon threads to
 1881          * the cores belonging to the domain, from which vm_page_array
 1882          * is allocated.
 1883          */
 1884 
 1885         KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
 1886         vmd->vmd_last_active_scan = ticks;
 1887 
 1888         /*
 1889          * The pageout daemon worker is never done, so loop forever.
 1890          */
 1891         while (TRUE) {
 1892                 vm_domain_pageout_lock(vmd);
 1893 
 1894                 /*
 1895                  * We need to clear wanted before we check the limits.  This
 1896                  * prevents races with wakers who will check wanted after they
 1897                  * reach the limit.
 1898                  */
 1899                 atomic_store_int(&vmd->vmd_pageout_wanted, 0);
 1900 
 1901                 /*
 1902                  * Might the page daemon need to run again?
 1903                  */
 1904                 if (vm_paging_needed(vmd, vmd->vmd_free_count)) {
 1905                         /*
 1906                          * Yes.  If the scan failed to produce enough free
 1907                          * pages, sleep uninterruptibly for some time in the
 1908                          * hope that the laundry thread will clean some pages.
 1909                          */
 1910                         vm_domain_pageout_unlock(vmd);
 1911                         if (!target_met)
 1912                                 pause("pwait", hz / VM_INACT_SCAN_RATE);
 1913                 } else {
 1914                         /*
 1915                          * No, sleep until the next wakeup or until pages
 1916                          * need to have their reference stats updated.
 1917                          */
 1918                         if (mtx_sleep(&vmd->vmd_pageout_wanted,
 1919                             vm_domain_pageout_lockptr(vmd), PDROP | PVM,
 1920                             "psleep", hz / VM_INACT_SCAN_RATE) == 0)
 1921                                 VM_CNT_INC(v_pdwakeups);
 1922                 }
 1923 
 1924                 /* Prevent spurious wakeups by ensuring that wanted is set. */
 1925                 atomic_store_int(&vmd->vmd_pageout_wanted, 1);
 1926 
 1927                 /*
 1928                  * Use the controller to calculate how many pages to free in
 1929                  * this interval, and scan the inactive queue.  If the lowmem
 1930                  * handlers appear to have freed up some pages, subtract the
 1931                  * difference from the inactive queue scan target.
 1932                  */
 1933                 shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count);
 1934                 if (shortage > 0) {
 1935                         ofree = vmd->vmd_free_count;
 1936                         if (vm_pageout_lowmem() && vmd->vmd_free_count > ofree)
 1937                                 shortage -= min(vmd->vmd_free_count - ofree,
 1938                                     (u_int)shortage);
 1939                         target_met = vm_pageout_scan_inactive(vmd, shortage,
 1940                             &addl_shortage);
 1941                 } else
 1942                         addl_shortage = 0;
 1943 
 1944                 /*
 1945                  * Scan the active queue.  A positive value for shortage
 1946                  * indicates that we must aggressively deactivate pages to avoid
 1947                  * a shortfall.
 1948                  */
 1949                 shortage = vm_pageout_active_target(vmd) + addl_shortage;
 1950                 vm_pageout_scan_active(vmd, shortage);
 1951         }
 1952 }
 1953 
 1954 /*
 1955  *      vm_pageout_init initialises basic pageout daemon settings.
 1956  */
 1957 static void
 1958 vm_pageout_init_domain(int domain)
 1959 {
 1960         struct vm_domain *vmd;
 1961         struct sysctl_oid *oid;
 1962 
 1963         vmd = VM_DOMAIN(domain);
 1964         vmd->vmd_interrupt_free_min = 2;
 1965 
 1966         /*
 1967          * v_free_reserved needs to include enough for the largest
 1968          * swap pager structures plus enough for any pv_entry structs
 1969          * when paging. 
 1970          */
 1971         if (vmd->vmd_page_count > 1024)
 1972                 vmd->vmd_free_min = 4 + (vmd->vmd_page_count - 1024) / 200;
 1973         else
 1974                 vmd->vmd_free_min = 4;
 1975         vmd->vmd_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
 1976             vmd->vmd_interrupt_free_min;
 1977         vmd->vmd_free_reserved = vm_pageout_page_count +
 1978             vmd->vmd_pageout_free_min + (vmd->vmd_page_count / 768);
 1979         vmd->vmd_free_severe = vmd->vmd_free_min / 2;
 1980         vmd->vmd_free_target = 4 * vmd->vmd_free_min + vmd->vmd_free_reserved;
 1981         vmd->vmd_free_min += vmd->vmd_free_reserved;
 1982         vmd->vmd_free_severe += vmd->vmd_free_reserved;
 1983         vmd->vmd_inactive_target = (3 * vmd->vmd_free_target) / 2;
 1984         if (vmd->vmd_inactive_target > vmd->vmd_free_count / 3)
 1985                 vmd->vmd_inactive_target = vmd->vmd_free_count / 3;
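              /*
               * Worked example (illustrative values: 4 KiB pages, a 64 KiB
               * MAXBSIZE, and a vm_pageout_page_count of 32 are assumed
               * here): a 1 GiB domain has 262144 pages, which yields
               * approximately
               *   free_min (base)   = 4 + 261120 / 200       = 1309
               *   pageout_free_min  = (2 * 65536) / 4096 + 2  = 34
               *   free_reserved     = 32 + 34 + 262144 / 768  = 407
               *   free_target      = 4 * 1309 + 407          = 5643 (~22 MiB)
               *   free_min (final) = 1309 + 407              = 1716 (~6.7 MiB)
               */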
 1986 
 1987         /*
 1988          * Set the default wakeup threshold to be 10% below the paging
 1989          * target.  This keeps the steady state out of shortfall.
 1990          */
 1991         vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_target / 10) * 9;
 1992 
 1993         /*
 1994          * Target amount of memory to move out of the laundry queue during a
 1995          * background laundering.  This is proportional to the amount of system
 1996          * memory.
 1997          */
 1998         vmd->vmd_background_launder_target = (vmd->vmd_free_target -
 1999             vmd->vmd_free_min) / 10;
 2000 
 2001         /* Initialize the pageout daemon pid controller. */
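              /*
               * The controller is seeded with vmd_free_target; on each scan
               * interval vm_pageout_worker() passes the current
               * vmd_free_count to pidctrl_daemon(), which converts the
               * deviation from the target into a page reclamation goal.
               */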
 2002         pidctrl_init(&vmd->vmd_pid, hz / VM_INACT_SCAN_RATE,
 2003             vmd->vmd_free_target, PIDCTRL_BOUND,
 2004             PIDCTRL_KPD, PIDCTRL_KID, PIDCTRL_KDD);
 2005         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
 2006             "pidctrl", CTLFLAG_RD, NULL, "");
 2007         pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid));
 2008 }
 2009 
 2010 static void
 2011 vm_pageout_init(void)
 2012 {
 2013         u_int freecount;
 2014         int i;
 2015 
 2016         /*
 2017          * Initialize some paging parameters.
 2018          */
 2019         if (vm_cnt.v_page_count < 2000)
 2020                 vm_pageout_page_count = 8;
 2021 
 2022         freecount = 0;
 2023         for (i = 0; i < vm_ndomains; i++) {
 2024                 struct vm_domain *vmd;
 2025 
 2026                 vm_pageout_init_domain(i);
 2027                 vmd = VM_DOMAIN(i);
 2028                 vm_cnt.v_free_reserved += vmd->vmd_free_reserved;
 2029                 vm_cnt.v_free_target += vmd->vmd_free_target;
 2030                 vm_cnt.v_free_min += vmd->vmd_free_min;
 2031                 vm_cnt.v_inactive_target += vmd->vmd_inactive_target;
 2032                 vm_cnt.v_pageout_free_min += vmd->vmd_pageout_free_min;
 2033                 vm_cnt.v_interrupt_free_min += vmd->vmd_interrupt_free_min;
 2034                 vm_cnt.v_free_severe += vmd->vmd_free_severe;
 2035                 freecount += vmd->vmd_free_count;
 2036         }
 2037 
 2038         /*
 2039          * Set the interval in seconds for the active scan.  We want to
 2040          * visit each page at least once every ten minutes.  This is to
 2041          * prevent worst-case paging behavior with a stale active LRU.
 2042          */
 2043         if (vm_pageout_update_period == 0)
 2044                 vm_pageout_update_period = 600;
 2045 
 2046         if (vm_page_max_wired == 0)
 2047                 vm_page_max_wired = freecount / 3;
 2048 }
 2049 
 2050 /*
 2051  *     vm_pageout is the high level pageout daemon.
 2052  */
 2053 static void
 2054 vm_pageout(void)
 2055 {
 2056         struct proc *p;
 2057         struct thread *td;
 2058         int error, first, i;
 2059 
 2060         p = curproc;
 2061         td = curthread;
 2062 
 2063         swap_pager_swap_init();
 2064         for (first = -1, i = 0; i < vm_ndomains; i++) {
 2065                 if (VM_DOMAIN_EMPTY(i)) {
 2066                         if (bootverbose)
 2067                                 printf("domain %d empty; skipping pageout\n",
 2068                                     i);
 2069                         continue;
 2070                 }
 2071                 if (first == -1)
 2072                         first = i;
 2073                 else {
 2074                         error = kthread_add(vm_pageout_worker,
 2075                             (void *)(uintptr_t)i, p, NULL, 0, 0, "dom%d", i);
 2076                         if (error != 0)
 2077                                 panic("starting pageout for domain %d: %d\n",
 2078                                     i, error);
 2079                 }
 2080                 error = kthread_add(vm_pageout_laundry_worker,
 2081                     (void *)(uintptr_t)i, p, NULL, 0, 0, "laundry: dom%d", i);
 2082                 if (error != 0)
 2083                         panic("starting laundry for domain %d: %d", i, error);
 2084         }
 2085         error = kthread_add(uma_reclaim_worker, NULL, p, NULL, 0, 0, "uma");
 2086         if (error != 0)
 2087                 panic("starting uma_reclaim helper, error %d\n", error);
 2088 
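              /*
               * The first non-empty domain is serviced by this thread rather
               * than by a separate kthread: rename it and enter its worker
               * loop, which never returns.
               */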
 2089         snprintf(td->td_name, sizeof(td->td_name), "dom%d", first);
 2090         vm_pageout_worker((void *)(uintptr_t)first);
 2091 }
 2092 
 2093 /*
 2094  * Perform an advisory wakeup of the page daemon.
 2095  */
 2096 void
 2097 pagedaemon_wakeup(int domain)
 2098 {
 2099         struct vm_domain *vmd;
 2100 
 2101         vmd = VM_DOMAIN(domain);
 2102         vm_domain_pageout_assert_unlocked(vmd);
 2103         if (curproc == pageproc)
 2104                 return;
 2105 
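              /*
               * Only the caller that bumps vmd_pageout_wanted from zero takes
               * the pageout lock and delivers the wakeup; concurrent callers
               * observe a nonzero count and return without touching the lock.
               */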
 2106         if (atomic_fetchadd_int(&vmd->vmd_pageout_wanted, 1) == 0) {
 2107                 vm_domain_pageout_lock(vmd);
 2108                 atomic_store_int(&vmd->vmd_pageout_wanted, 1);
 2109                 wakeup(&vmd->vmd_pageout_wanted);
 2110                 vm_domain_pageout_unlock(vmd);
 2111         }
 2112 }
