FreeBSD/Linux Kernel Cross Reference
sys/uvm/uvm_pdaemon.c


    1 /*      $NetBSD: uvm_pdaemon.c,v 1.133 2021/04/17 21:37:21 mrg Exp $    */
    2 
    3 /*
    4  * Copyright (c) 1997 Charles D. Cranor and Washington University.
    5  * Copyright (c) 1991, 1993, The Regents of the University of California.
    6  *
    7  * All rights reserved.
    8  *
    9  * This code is derived from software contributed to Berkeley by
   10  * The Mach Operating System project at Carnegie-Mellon University.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  *      @(#)vm_pageout.c        8.5 (Berkeley) 2/14/94
   37  * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
   38  *
   39  *
   40  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
   41  * All rights reserved.
   42  *
   43  * Permission to use, copy, modify and distribute this software and
   44  * its documentation is hereby granted, provided that both the copyright
   45  * notice and this permission notice appear in all copies of the
   46  * software, derivative works or modified versions, and any portions
   47  * thereof, and that both notices appear in supporting documentation.
   48  *
   49  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   50  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   51  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   52  *
   53  * Carnegie Mellon requests users of this software to return to
   54  *
   55  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   56  *  School of Computer Science
   57  *  Carnegie Mellon University
   58  *  Pittsburgh PA 15213-3890
   59  *
   60  * any improvements or extensions that they make and grant Carnegie the
   61  * rights to redistribute these changes.
   62  */
   63 
   64 /*
   65  * uvm_pdaemon.c: the page daemon
   66  */
   67 
   68 #include <sys/cdefs.h>
   69 __KERNEL_RCSID(0, "$NetBSD: uvm_pdaemon.c,v 1.133 2021/04/17 21:37:21 mrg Exp $");
   70 
   71 #include "opt_uvmhist.h"
   72 #include "opt_readahead.h"
   73 
   74 #define __RWLOCK_PRIVATE
   75 
   76 #include <sys/param.h>
   77 #include <sys/proc.h>
   78 #include <sys/systm.h>
   79 #include <sys/kernel.h>
   80 #include <sys/pool.h>
   81 #include <sys/buf.h>
   82 #include <sys/module.h>
   83 #include <sys/atomic.h>
   84 #include <sys/kthread.h>
   85 
   86 #include <uvm/uvm.h>
   87 #include <uvm/uvm_pdpolicy.h>
   88 #include <uvm/uvm_pgflcache.h>
   89 
   90 #ifdef UVMHIST
   91 #ifndef UVMHIST_PDHIST_SIZE
   92 #define UVMHIST_PDHIST_SIZE 100
   93 #endif
   94 static struct kern_history_ent pdhistbuf[UVMHIST_PDHIST_SIZE];
   95 UVMHIST_DEFINE(pdhist) = UVMHIST_INITIALIZER(pdhisthist, pdhistbuf);
   96 #endif
   97 
   98 /*
   99  * UVMPD_NUMDIRTYREACTS is how many dirty pages the pagedaemon will reactivate
  100  * in a pass thru the inactive list when swap is full.  the value should be
  101  * "small"... if it's too large we'll cycle the active pages thru the inactive
   102  * queue too quickly for them to be referenced and avoid being freed.
  103  */
  104 
  105 #define UVMPD_NUMDIRTYREACTS    16
  106 
  107 /*
  108  * local prototypes
  109  */
  110 
  111 static void     uvmpd_scan(void);
  112 static void     uvmpd_scan_queue(void);
  113 static void     uvmpd_tune(void);
  114 static void     uvmpd_pool_drain_thread(void *);
  115 static void     uvmpd_pool_drain_wakeup(void);
  116 
  117 static unsigned int uvm_pagedaemon_waiters;
  118 
  119 /* State for the pool drainer thread */
  120 static kmutex_t uvmpd_lock __cacheline_aligned;
  121 static kcondvar_t uvmpd_pool_drain_cv;
  122 static bool uvmpd_pool_drain_run = false;
  123 
  124 /*
  125  * XXX hack to avoid hangs when large processes fork.
  126  */
  127 u_int uvm_extrapages;
  128 
  129 /*
  130  * uvm_wait: wait (sleep) for the page daemon to free some pages
  131  *
  132  * => should be called with all locks released
  133  * => should _not_ be called by the page daemon (to avoid deadlock)
  134  */
  135 
  136 void
  137 uvm_wait(const char *wmsg)
  138 {
  139         int timo = 0;
  140 
  141         if (uvm.pagedaemon_lwp == NULL)
  142                 panic("out of memory before the pagedaemon thread exists");
  143 
  144         mutex_spin_enter(&uvmpd_lock);
  145 
  146         /*
  147          * check for page daemon going to sleep (waiting for itself)
  148          */
  149 
  150         if (curlwp == uvm.pagedaemon_lwp && uvmexp.paging == 0) {
  151                 /*
  152                  * now we have a problem: the pagedaemon wants to go to
  153                  * sleep until it frees more memory.   but how can it
  154                  * free more memory if it is asleep?  that is a deadlock.
  155                  * we have two options:
  156                  *  [1] panic now
  157                  *  [2] put a timeout on the sleep, thus causing the
  158                  *      pagedaemon to only pause (rather than sleep forever)
  159                  *
  160                  * note that option [2] will only help us if we get lucky
  161                  * and some other process on the system breaks the deadlock
  162                  * by exiting or freeing memory (thus allowing the pagedaemon
  163                  * to continue).  for now we panic if DEBUG is defined,
  164                  * otherwise we hope for the best with option [2] (better
  165                  * yet, this should never happen in the first place!).
  166                  */
  167 
  168                 printf("pagedaemon: deadlock detected!\n");
  169                 timo = hz >> 3;         /* set timeout */
  170 #if defined(DEBUG)
  171                 /* DEBUG: panic so we can debug it */
  172                 panic("pagedaemon deadlock");
  173 #endif
  174         }
  175 
  176         uvm_pagedaemon_waiters++;
  177         wakeup(&uvm.pagedaemon);                /* wake the daemon! */
  178         UVM_UNLOCK_AND_WAIT(&uvmexp.free, &uvmpd_lock, false, wmsg, timo);
  179 }
  180 
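/*
 * Illustrative sketch (not part of this file): the typical caller of
 * uvm_wait() is an allocation path that failed to get a page and holds
 * no locks.  The helper below is hypothetical and only demonstrates the
 * unlock / uvm_wait / retry pattern described above.
 */
static struct vm_page *
example_pagealloc_retry(struct uvm_object *uobj, voff_t off)
{
        struct vm_page *pg;

        for (;;) {
                rw_enter(uobj->vmobjlock, RW_WRITER);
                pg = uvm_pagealloc(uobj, off, NULL, 0);
                rw_exit(uobj->vmobjlock);
                if (pg != NULL) {
                        return pg;
                }

                /* all locks released: safe to sleep for the pagedaemon */
                uvm_wait("examplewt");
        }
}
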
  181 /*
  182  * uvm_kick_pdaemon: perform checks to determine if we need to
  183  * give the pagedaemon a nudge, and do so if necessary.
  184  */
  185 
  186 void
  187 uvm_kick_pdaemon(void)
  188 {
  189         int fpages = uvm_availmem(false);
  190 
  191         if (fpages + uvmexp.paging < uvmexp.freemin ||
  192             (fpages + uvmexp.paging < uvmexp.freetarg &&
  193              uvmpdpol_needsscan_p()) ||
  194              uvm_km_va_starved_p()) {
  195                 mutex_spin_enter(&uvmpd_lock);
  196                 wakeup(&uvm.pagedaemon);
  197                 mutex_spin_exit(&uvmpd_lock);
  198         }
  199 }
  200 
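/*
 * Worked example (hypothetical numbers): with freemin = 1024 pages and
 * freetarg = 1365 pages, a caller seeing 900 free pages and 50 pages
 * already being paged out computes 900 + 50 = 950 < 1024, so the first
 * test fires and the pagedaemon is woken.  With 1200 free pages the
 * daemon is woken only if the pdpolicy says a scan is needed
 * (1200 + 50 = 1250 < 1365) or if kernel VA is starved.
 */
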
  201 /*
  202  * uvmpd_tune: tune paging parameters
  203  *
   204  * => called whenever memory is added to (or removed from?) the system
  205  */
  206 
  207 static void
  208 uvmpd_tune(void)
  209 {
  210         int val;
  211 
  212         UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
  213 
  214         /*
  215          * try to keep 0.5% of available RAM free, but limit to between
  216          * 128k and 1024k per-CPU.  XXX: what are these values good for?
  217          */
  218         val = uvmexp.npages / 200;
  219         val = MAX(val, (128*1024) >> PAGE_SHIFT);
  220         val = MIN(val, (1024*1024) >> PAGE_SHIFT);
  221         val *= ncpu;
  222 
  223         /* Make sure there's always a user page free. */
  224         if (val < uvmexp.reserve_kernel + 1)
  225                 val = uvmexp.reserve_kernel + 1;
  226         uvmexp.freemin = val;
  227 
  228         /* Calculate free target. */
  229         val = (uvmexp.freemin * 4) / 3;
  230         if (val <= uvmexp.freemin)
  231                 val = uvmexp.freemin + 1;
  232         uvmexp.freetarg = val + atomic_swap_uint(&uvm_extrapages, 0);
  233 
  234         uvmexp.wiredmax = uvmexp.npages / 3;
  235         UVMHIST_LOG(pdhist, "<- done, freemin=%jd, freetarg=%jd, wiredmax=%jd",
  236               uvmexp.freemin, uvmexp.freetarg, uvmexp.wiredmax, 0);
  237 }
  238 
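/*
 * Worked example (hypothetical machine): with 4 KiB pages, 4 GiB of RAM
 * and ncpu = 4, npages = 1048576 and npages / 200 = 5242.  The per-CPU
 * clamp of [128 KiB, 1024 KiB] is [32, 256] pages, so the value is
 * capped at 256 and multiplied by ncpu, giving freemin = 1024 pages
 * (4 MiB).  freetarg then becomes (1024 * 4) / 3 = 1365 pages (about
 * 5.3 MiB), plus whatever uvm_extrapages was pending.
 */
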
  239 /*
  240  * uvm_pageout: the main loop for the pagedaemon
  241  */
  242 
  243 void
  244 uvm_pageout(void *arg)
  245 {
  246         int npages = 0;
  247         int extrapages = 0;
  248         int fpages;
  249 
  250         UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
  251 
  252         UVMHIST_LOG(pdhist,"<starting uvm pagedaemon>", 0, 0, 0, 0);
  253 
  254         mutex_init(&uvmpd_lock, MUTEX_DEFAULT, IPL_VM);
  255         cv_init(&uvmpd_pool_drain_cv, "pooldrain");
  256 
  257         /* Create the pool drainer kernel thread. */
  258         if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL,
  259             uvmpd_pool_drain_thread, NULL, NULL, "pooldrain"))
  260                 panic("fork pooldrain");
  261 
  262         /*
  263          * ensure correct priority and set paging parameters...
  264          */
  265 
  266         uvm.pagedaemon_lwp = curlwp;
  267         npages = uvmexp.npages;
  268         uvmpd_tune();
  269 
  270         /*
  271          * main loop
  272          */
  273 
  274         for (;;) {
  275                 bool needsscan, needsfree, kmem_va_starved;
  276 
  277                 kmem_va_starved = uvm_km_va_starved_p();
  278 
  279                 mutex_spin_enter(&uvmpd_lock);
  280                 if ((uvm_pagedaemon_waiters == 0 || uvmexp.paging > 0) &&
  281                     !kmem_va_starved) {
  282                         UVMHIST_LOG(pdhist,"  <<SLEEPING>>",0,0,0,0);
  283                         UVM_UNLOCK_AND_WAIT(&uvm.pagedaemon,
  284                             &uvmpd_lock, false, "pgdaemon", 0);
  285                         uvmexp.pdwoke++;
  286                         UVMHIST_LOG(pdhist,"  <<WOKE UP>>",0,0,0,0);
  287                 } else {
  288                         mutex_spin_exit(&uvmpd_lock);
  289                 }
  290 
  291                 /*
  292                  * now recompute inactive count
  293                  */
  294 
  295                 if (npages != uvmexp.npages || extrapages != uvm_extrapages) {
  296                         npages = uvmexp.npages;
  297                         extrapages = uvm_extrapages;
  298                         uvmpd_tune();
  299                 }
  300 
  301                 uvmpdpol_tune();
  302 
  303                 /*
   304                  * Estimate a hint.  Note that bufmem is returned to the
   305                  * system only when an entire pool page is empty.
  306                  */
  307                 fpages = uvm_availmem(false);
  308                 UVMHIST_LOG(pdhist,"  free/ftarg=%jd/%jd",
  309                     fpages, uvmexp.freetarg, 0,0);
  310 
  311                 needsfree = fpages + uvmexp.paging < uvmexp.freetarg;
  312                 needsscan = needsfree || uvmpdpol_needsscan_p();
  313 
  314                 /*
  315                  * scan if needed
  316                  */
  317                 if (needsscan) {
  318                         uvmpd_scan();
  319                 }
  320 
  321                 /*
  322                  * if there's any free memory to be had,
  323                  * wake up any waiters.
  324                  */
  325                 if (uvm_availmem(false) > uvmexp.reserve_kernel ||
  326                     uvmexp.paging == 0) {
  327                         mutex_spin_enter(&uvmpd_lock);
  328                         wakeup(&uvmexp.free);
  329                         uvm_pagedaemon_waiters = 0;
  330                         mutex_spin_exit(&uvmpd_lock);
  331                 }
  332 
  333                 /*
  334                  * scan done.  if we don't need free memory, we're done.
  335                  */
  336 
  337                 if (!needsfree && !kmem_va_starved)
  338                         continue;
  339 
  340                 /*
  341                  * kick the pool drainer thread.
  342                  */
  343 
  344                 uvmpd_pool_drain_wakeup();
  345         }
  346         /*NOTREACHED*/
  347 }
  348 
  349 void
  350 uvm_pageout_start(int npages)
  351 {
  352 
  353         atomic_add_int(&uvmexp.paging, npages);
  354 }
  355 
  356 void
  357 uvm_pageout_done(int npages)
  358 {
  359 
  360         KASSERT(atomic_load_relaxed(&uvmexp.paging) >= npages);
  361 
  362         if (npages == 0) {
  363                 return;
  364         }
  365 
  366         atomic_add_int(&uvmexp.paging, -npages);
  367 
  368         /*
   369          * wake up either the pagedaemon or the LWPs waiting for it.
  370          */
  371 
  372         mutex_spin_enter(&uvmpd_lock);
  373         if (uvm_availmem(false) <= uvmexp.reserve_kernel) {
  374                 wakeup(&uvm.pagedaemon);
  375         } else if (uvm_pagedaemon_waiters != 0) {
  376                 wakeup(&uvmexp.free);
  377                 uvm_pagedaemon_waiters = 0;
  378         }
  379         mutex_spin_exit(&uvmpd_lock);
  380 }
  381 
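/*
 * Illustrative pairing (a sketch, not code from this file): an
 * asynchronous pageout path bumps uvmexp.paging before queueing the
 * I/O and the completion path drops it again, roughly:
 *
 *      uvm_pageout_start(nused);
 *      error = uvm_swap_put(slot, pages, nused, 0);    // async write
 *      ...
 *      uvm_pageout_done(nused);        // from the I/O completion path
 *
 * swapcluster_flush() below shows the start half in this file.
 */
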
  382 static krwlock_t *
  383 uvmpd_page_owner_lock(struct vm_page *pg)
  384 {
  385         struct uvm_object *uobj = pg->uobject;
  386         struct vm_anon *anon = pg->uanon;
  387         krwlock_t *slock;
  388 
  389         KASSERT(mutex_owned(&pg->interlock));
  390 
  391 #ifdef DEBUG
  392         if (uobj == (void *)0xdeadbeef || anon == (void *)0xdeadbeef) {
  393                 return NULL;
  394         }
  395 #endif
  396         if (uobj != NULL) {
  397                 slock = uobj->vmobjlock;
  398                 KASSERTMSG(slock != NULL, "pg %p uobj %p, NULL lock", pg, uobj);
  399         } else if (anon != NULL) {
  400                 slock = anon->an_lock;
  401                 KASSERTMSG(slock != NULL, "pg %p anon %p, NULL lock", pg, anon);
  402         } else {
  403                 slock = NULL;
  404         }
  405         return slock;
  406 }
  407 
  408 /*
  409  * uvmpd_trylockowner: trylock the page's owner.
  410  *
  411  * => called with page interlock held.
  412  * => resolve orphaned O->A loaned page.
   413  * => return the locked rwlock on success.  otherwise, return NULL.
  414  */
  415 
  416 krwlock_t *
  417 uvmpd_trylockowner(struct vm_page *pg)
  418 {
  419         krwlock_t *slock, *heldslock;
  420 
  421         KASSERT(mutex_owned(&pg->interlock));
  422 
  423         slock = uvmpd_page_owner_lock(pg);
  424         if (slock == NULL) {
  425                 /* Page may be in state of flux - ignore. */
  426                 mutex_exit(&pg->interlock);
  427                 return NULL;
  428         }
  429 
  430         if (rw_tryenter(slock, RW_WRITER)) {
  431                 goto success;
  432         }
  433 
  434         /*
  435          * The try-lock didn't work, so now do a blocking lock after
  436          * dropping the page interlock.  Prevent the owner lock from
  437          * being freed by taking a hold on it first.
  438          */
  439 
  440         rw_obj_hold(slock);
  441         mutex_exit(&pg->interlock);
  442         rw_enter(slock, RW_WRITER);
  443         heldslock = slock;
  444 
  445         /*
  446          * Now we hold some owner lock.  Check if the lock we hold
  447          * is still the lock for the owner of the page.
  448          * If it is then return it, otherwise release it and return NULL.
  449          */
  450 
  451         mutex_enter(&pg->interlock);
  452         slock = uvmpd_page_owner_lock(pg);
  453         if (heldslock != slock) {
  454                 rw_exit(heldslock);
  455                 slock = NULL;
  456         }
  457         rw_obj_free(heldslock);
  458         if (slock != NULL) {
  459 success:
  460                 /*
  461                  * Set PG_ANON if it isn't set already.
  462                  */
  463                 if (pg->uobject == NULL && (pg->flags & PG_ANON) == 0) {
  464                         KASSERT(pg->loan_count > 0);
  465                         pg->loan_count--;
  466                         pg->flags |= PG_ANON;
  467                         /* anon now owns it */
  468                 }
  469         }
  470         mutex_exit(&pg->interlock);
  471         return slock;
  472 }
  473 
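/*
 * Illustrative caller (hypothetical, not part of this file): code that
 * wants to work on a page's owner enters the page interlock, calls
 * uvmpd_trylockowner() (which always releases the interlock), and must
 * drop the returned lock itself when it is done.
 */
static bool
example_with_owner_locked(struct vm_page *pg)
{
        krwlock_t *lock;

        mutex_enter(&pg->interlock);
        lock = uvmpd_trylockowner(pg);
        if (lock == NULL) {
                /* the owner is busy or the page is in flux: skip it */
                return false;
        }

        /* ... the page's owner (uobject or anon) is now write-locked ... */

        rw_exit(lock);
        return true;
}
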
  474 #if defined(VMSWAP)
  475 struct swapcluster {
  476         int swc_slot;
  477         int swc_nallocated;
  478         int swc_nused;
  479         struct vm_page *swc_pages[howmany(MAXPHYS, MIN_PAGE_SIZE)];
  480 };
  481 
  482 static void
  483 swapcluster_init(struct swapcluster *swc)
  484 {
  485 
  486         swc->swc_slot = 0;
  487         swc->swc_nused = 0;
  488 }
  489 
  490 static int
  491 swapcluster_allocslots(struct swapcluster *swc)
  492 {
  493         int slot;
  494         int npages;
  495 
  496         if (swc->swc_slot != 0) {
  497                 return 0;
  498         }
  499 
  500         /* Even with strange MAXPHYS, the shift
  501            implicitly rounds down to a page. */
  502         npages = MAXPHYS >> PAGE_SHIFT;
  503         slot = uvm_swap_alloc(&npages, true);
  504         if (slot == 0) {
  505                 return ENOMEM;
  506         }
  507         swc->swc_slot = slot;
  508         swc->swc_nallocated = npages;
  509         swc->swc_nused = 0;
  510 
  511         return 0;
  512 }
  513 
  514 static int
  515 swapcluster_add(struct swapcluster *swc, struct vm_page *pg)
  516 {
  517         int slot;
  518         struct uvm_object *uobj;
  519 
  520         KASSERT(swc->swc_slot != 0);
  521         KASSERT(swc->swc_nused < swc->swc_nallocated);
  522         KASSERT((pg->flags & PG_SWAPBACKED) != 0);
  523 
  524         slot = swc->swc_slot + swc->swc_nused;
  525         uobj = pg->uobject;
  526         if (uobj == NULL) {
  527                 KASSERT(rw_write_held(pg->uanon->an_lock));
  528                 pg->uanon->an_swslot = slot;
  529         } else {
  530                 int result;
  531 
  532                 KASSERT(rw_write_held(uobj->vmobjlock));
  533                 result = uao_set_swslot(uobj, pg->offset >> PAGE_SHIFT, slot);
  534                 if (result == -1) {
  535                         return ENOMEM;
  536                 }
  537         }
  538         swc->swc_pages[swc->swc_nused] = pg;
  539         swc->swc_nused++;
  540 
  541         return 0;
  542 }
  543 
  544 static void
  545 swapcluster_flush(struct swapcluster *swc, bool now)
  546 {
  547         int slot;
  548         int nused;
  549         int nallocated;
  550         int error __diagused;
  551 
  552         if (swc->swc_slot == 0) {
  553                 return;
  554         }
  555         KASSERT(swc->swc_nused <= swc->swc_nallocated);
  556 
  557         slot = swc->swc_slot;
  558         nused = swc->swc_nused;
  559         nallocated = swc->swc_nallocated;
  560 
  561         /*
  562          * if this is the final pageout we could have a few
  563          * unused swap blocks.  if so, free them now.
  564          */
  565 
  566         if (nused < nallocated) {
  567                 if (!now) {
  568                         return;
  569                 }
  570                 uvm_swap_free(slot + nused, nallocated - nused);
  571         }
  572 
  573         /*
  574          * now start the pageout.
  575          */
  576 
  577         if (nused > 0) {
  578                 uvmexp.pdpageouts++;
  579                 uvm_pageout_start(nused);
  580                 error = uvm_swap_put(slot, swc->swc_pages, nused, 0);
  581                 KASSERT(error == 0 || error == ENOMEM);
  582         }
  583 
  584         /*
  585          * zero swslot to indicate that we are
  586          * no longer building a swap-backed cluster.
  587          */
  588 
  589         swc->swc_slot = 0;
  590         swc->swc_nused = 0;
  591 }
  592 
  593 static int
  594 swapcluster_nused(struct swapcluster *swc)
  595 {
  596 
  597         return swc->swc_nused;
  598 }
  599 
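/*
 * Lifecycle sketch (summary, not code from this file): uvmpd_scan_queue()
 * below drives a swapcluster roughly as
 *
 *      swapcluster_init(&swc);
 *      while (scanning) {
 *              swapcluster_allocslots(&swc);   // reserve slots if needed
 *              swapcluster_add(&swc, pg);      // queue a busy, dirty page
 *              swapcluster_flush(&swc, false); // writes only once full
 *      }
 *      swapcluster_flush(&swc, true);          // final: write + free spares
 */
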
  600 /*
  601  * uvmpd_dropswap: free any swap allocated to this page.
  602  *
  603  * => called with owner locked.
  604  * => return true if a page had an associated slot.
  605  */
  606 
  607 bool
  608 uvmpd_dropswap(struct vm_page *pg)
  609 {
  610         bool result = false;
  611         struct vm_anon *anon = pg->uanon;
  612 
  613         if ((pg->flags & PG_ANON) && anon->an_swslot) {
  614                 uvm_swap_free(anon->an_swslot, 1);
  615                 anon->an_swslot = 0;
  616                 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
  617                 result = true;
  618         } else if (pg->flags & PG_AOBJ) {
  619                 int slot = uao_set_swslot(pg->uobject,
  620                     pg->offset >> PAGE_SHIFT, 0);
  621                 if (slot) {
  622                         uvm_swap_free(slot, 1);
  623                         uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
  624                         result = true;
  625                 }
  626         }
  627 
  628         return result;
  629 }
  630 
  631 #endif /* defined(VMSWAP) */
  632 
  633 /*
   634  * uvmpd_scan_queue: scan a replacement candidate list for pages
  635  * to clean or free.
  636  *
  637  * => we work on meeting our free target by converting inactive pages
  638  *    into free pages.
  639  * => we handle the building of swap-backed clusters
  640  */
  641 
  642 static void
  643 uvmpd_scan_queue(void)
  644 {
  645         struct vm_page *p;
  646         struct uvm_object *uobj;
  647         struct vm_anon *anon;
  648 #if defined(VMSWAP)
  649         struct swapcluster swc;
  650 #endif /* defined(VMSWAP) */
  651         int dirtyreacts;
  652         krwlock_t *slock;
  653         UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
  654 
  655         /*
  656          * swslot is non-zero if we are building a swap cluster.  we want
  657          * to stay in the loop while we have a page to scan or we have
  658          * a swap-cluster to build.
  659          */
  660 
  661 #if defined(VMSWAP)
  662         swapcluster_init(&swc);
  663 #endif /* defined(VMSWAP) */
  664 
  665         dirtyreacts = 0;
  666         uvmpdpol_scaninit();
  667 
  668         while (/* CONSTCOND */ 1) {
  669 
  670                 /*
  671                  * see if we've met the free target.
  672                  */
  673 
  674                 if (uvm_availmem(false) + uvmexp.paging
  675 #if defined(VMSWAP)
  676                     + swapcluster_nused(&swc)
  677 #endif /* defined(VMSWAP) */
  678                     >= uvmexp.freetarg << 2 ||
  679                     dirtyreacts == UVMPD_NUMDIRTYREACTS) {
  680                         UVMHIST_LOG(pdhist,"  met free target: "
  681                                     "exit loop", 0, 0, 0, 0);
  682                         break;
  683                 }
  684 
  685                 /*
  686                  * first we have the pdpolicy select a victim page
  687                  * and attempt to lock the object that the page
  688                  * belongs to.  if our attempt fails we skip on to
  689                  * the next page (no harm done).  it is important to
  690                  * "try" locking the object as we are locking in the
  691                  * wrong order (pageq -> object) and we don't want to
  692                  * deadlock.
  693                  *
  694                  * the only time we expect to see an ownerless page
  695                  * (i.e. a page with no uobject and !PG_ANON) is if an
  696                  * anon has loaned a page from a uvm_object and the
  697                  * uvm_object has dropped the ownership.  in that
  698                  * case, the anon can "take over" the loaned page
  699                  * and make it its own.
  700                  */
  701 
  702                 p = uvmpdpol_selectvictim(&slock);
  703                 if (p == NULL) {
  704                         break;
  705                 }
  706                 KASSERT(uvmpdpol_pageisqueued_p(p));
  707                 KASSERT(uvm_page_owner_locked_p(p, true));
  708                 KASSERT(p->wire_count == 0);
  709 
  710                 /*
  711                  * we are below target and have a new page to consider.
  712                  */
  713 
  714                 anon = p->uanon;
  715                 uobj = p->uobject;
  716 
  717                 if (p->flags & PG_BUSY) {
  718                         rw_exit(slock);
  719                         uvmexp.pdbusy++;
  720                         continue;
  721                 }
  722 
  723                 /* does the page belong to an object? */
  724                 if (uobj != NULL) {
  725                         uvmexp.pdobscan++;
  726                 } else {
  727 #if defined(VMSWAP)
  728                         KASSERT(anon != NULL);
  729                         uvmexp.pdanscan++;
  730 #else /* defined(VMSWAP) */
  731                         panic("%s: anon", __func__);
  732 #endif /* defined(VMSWAP) */
  733                 }
  734 
  735 
  736                 /*
  737                  * we now have the object locked.
  738                  * if the page is not swap-backed, call the object's
  739                  * pager to flush and free the page.
  740                  */
  741 
  742 #if defined(READAHEAD_STATS)
  743                 if ((p->flags & PG_READAHEAD) != 0) {
  744                         p->flags &= ~PG_READAHEAD;
  745                         uvm_ra_miss.ev_count++;
  746                 }
  747 #endif /* defined(READAHEAD_STATS) */
  748 
  749                 if ((p->flags & PG_SWAPBACKED) == 0) {
  750                         KASSERT(uobj != NULL);
  751                         (void) (uobj->pgops->pgo_put)(uobj, p->offset,
  752                             p->offset + PAGE_SIZE, PGO_CLEANIT|PGO_FREE);
  753                         continue;
  754                 }
  755 
  756                 /*
  757                  * the page is swap-backed.  remove all the permissions
  758                  * from the page so we can sync the modified info
  759                  * without any race conditions.  if the page is clean
  760                  * we can free it now and continue.
  761                  */
  762 
  763                 pmap_page_protect(p, VM_PROT_NONE);
  764                 if (uvm_pagegetdirty(p) == UVM_PAGE_STATUS_UNKNOWN) {
  765                         if (pmap_clear_modify(p)) {
  766                                 uvm_pagemarkdirty(p, UVM_PAGE_STATUS_DIRTY);
  767                         } else {
  768                                 uvm_pagemarkdirty(p, UVM_PAGE_STATUS_CLEAN);
  769                         }
  770                 }
  771                 if (uvm_pagegetdirty(p) != UVM_PAGE_STATUS_DIRTY) {
  772                         int slot;
  773                         int pageidx;
  774 
  775                         pageidx = p->offset >> PAGE_SHIFT;
  776                         uvm_pagefree(p);
  777                         atomic_inc_uint(&uvmexp.pdfreed);
  778 
  779                         /*
  780                          * for anons, we need to remove the page
  781                          * from the anon ourselves.  for aobjs,
  782                          * pagefree did that for us.
  783                          */
  784 
  785                         if (anon) {
  786                                 KASSERT(anon->an_swslot != 0);
  787                                 anon->an_page = NULL;
  788                                 slot = anon->an_swslot;
  789                         } else {
  790                                 slot = uao_find_swslot(uobj, pageidx);
  791                         }
  792                         if (slot > 0) {
  793                                 /* this page is now only in swap. */
  794                                 KASSERT(uvmexp.swpgonly < uvmexp.swpginuse);
  795                                 atomic_inc_uint(&uvmexp.swpgonly);
  796                         }
  797                         rw_exit(slock);
  798                         continue;
  799                 }
  800 
  801 #if defined(VMSWAP)
  802                 /*
  803                  * this page is dirty, skip it if we'll have met our
  804                  * free target when all the current pageouts complete.
  805                  */
  806 
  807                 if (uvm_availmem(false) + uvmexp.paging >
  808                     uvmexp.freetarg << 2) {
  809                         rw_exit(slock);
  810                         continue;
  811                 }
  812 
  813                 /*
  814                  * free any swap space allocated to the page since
  815                  * we'll have to write it again with its new data.
  816                  */
  817 
  818                 uvmpd_dropswap(p);
  819 
  820                 /*
  821                  * start new swap pageout cluster (if necessary).
  822                  *
  823                  * if swap is full reactivate this page so that
  824                  * we eventually cycle all pages through the
  825                  * inactive queue.
  826                  */
  827 
  828                 if (swapcluster_allocslots(&swc)) {
  829                         dirtyreacts++;
  830                         uvm_pagelock(p);
  831                         uvm_pageactivate(p);
  832                         uvm_pageunlock(p);
  833                         rw_exit(slock);
  834                         continue;
  835                 }
  836 
  837                 /*
   838                  * at this point, we're definitely going to reuse this
  839                  * page.  mark the page busy and delayed-free.
  840                  * we should remove the page from the page queues
  841                  * so we don't ever look at it again.
  842                  * adjust counters and such.
  843                  */
  844 
  845                 p->flags |= PG_BUSY;
  846                 UVM_PAGE_OWN(p, "scan_queue");
  847                 p->flags |= PG_PAGEOUT;
  848                 uvmexp.pgswapout++;
  849 
  850                 uvm_pagelock(p);
  851                 uvm_pagedequeue(p);
  852                 uvm_pageunlock(p);
  853 
  854                 /*
  855                  * add the new page to the cluster.
  856                  */
  857 
  858                 if (swapcluster_add(&swc, p)) {
  859                         p->flags &= ~(PG_BUSY|PG_PAGEOUT);
  860                         UVM_PAGE_OWN(p, NULL);
  861                         dirtyreacts++;
  862                         uvm_pagelock(p);
  863                         uvm_pageactivate(p);
  864                         uvm_pageunlock(p);
  865                         rw_exit(slock);
  866                         continue;
  867                 }
  868                 rw_exit(slock);
  869 
  870                 swapcluster_flush(&swc, false);
  871 
  872                 /*
  873                  * the pageout is in progress.  bump counters and set up
  874                  * for the next loop.
  875                  */
  876 
  877                 atomic_inc_uint(&uvmexp.pdpending);
  878 
  879 #else /* defined(VMSWAP) */
  880                 uvm_pagelock(p);
  881                 uvm_pageactivate(p);
  882                 uvm_pageunlock(p);
  883                 rw_exit(slock);
  884 #endif /* defined(VMSWAP) */
  885         }
  886 
  887         uvmpdpol_scanfini();
  888 
  889 #if defined(VMSWAP)
  890         swapcluster_flush(&swc, true);
  891 #endif /* defined(VMSWAP) */
  892 }
  893 
  894 /*
  895  * uvmpd_scan: scan the page queues and attempt to meet our targets.
  896  */
  897 
  898 static void
  899 uvmpd_scan(void)
  900 {
  901         int swap_shortage, pages_freed, fpages;
  902         UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
  903 
  904         uvmexp.pdrevs++;
  905 
  906         /*
  907          * work on meeting our targets.   first we work on our free target
  908          * by converting inactive pages into free pages.  then we work on
  909          * meeting our inactive target by converting active pages to
  910          * inactive ones.
  911          */
  912 
  913         UVMHIST_LOG(pdhist, "  starting 'free' loop",0,0,0,0);
  914 
  915         pages_freed = uvmexp.pdfreed;
  916         uvmpd_scan_queue();
  917         pages_freed = uvmexp.pdfreed - pages_freed;
  918 
  919         /*
  920          * detect if we're not going to be able to page anything out
  921          * until we free some swap resources from active pages.
  922          */
  923 
  924         swap_shortage = 0;
  925         fpages = uvm_availmem(false);
  926         if (fpages < uvmexp.freetarg &&
  927             uvmexp.swpginuse >= uvmexp.swpgavail &&
  928             !uvm_swapisfull() &&
  929             pages_freed == 0) {
  930                 swap_shortage = uvmexp.freetarg - fpages;
  931         }
  932 
  933         uvmpdpol_balancequeue(swap_shortage);
  934 
  935         /*
  936          * if still below the minimum target, try unloading kernel
  937          * modules.
  938          */
  939 
  940         if (uvm_availmem(false) < uvmexp.freemin) {
  941                 module_thread_kick();
  942         }
  943 }
  944 
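/*
 * Worked example (hypothetical numbers): if freetarg is 1365 pages but
 * only 1000 pages are free, the queue scan freed nothing, every
 * available swap page is allocated and yet swap is not "full" of
 * swap-only pages, then swap_shortage = 1365 - 1000 = 365 and
 * uvmpdpol_balancequeue(365) will try to recover swap space from that
 * many active anonymous pages.
 */
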
  945 /*
  946  * uvm_reclaimable: decide whether to wait for pagedaemon.
  947  *
  948  * => return true if it seems to be worth to do uvm_wait.
  949  *
  950  * XXX should be tunable.
  951  * XXX should consider pools, etc?
  952  */
  953 
  954 bool
  955 uvm_reclaimable(void)
  956 {
  957         int filepages;
  958         int active, inactive;
  959 
  960         /*
  961          * if swap is not full, no problem.
  962          */
  963 
  964         if (!uvm_swapisfull()) {
  965                 return true;
  966         }
  967 
  968         /*
  969          * file-backed pages can be reclaimed even when swap is full.
  970          * if we have more than 1/16 of pageable memory or 5MB, try to reclaim.
  971          * NB: filepages calculation does not exclude EXECPAGES - intentional.
  972          *
  973          * XXX assume the worst case, ie. all wired pages are file-backed.
  974          *
   975          * XXX should consider other reclaimable memory.
  976          * XXX ie. pools, traditional buffer cache.
  977          */
  978 
  979         cpu_count_sync(false);
  980         filepages = (int)(cpu_count_get(CPU_COUNT_FILECLEAN) +
  981             cpu_count_get(CPU_COUNT_FILEUNKNOWN) +
  982             cpu_count_get(CPU_COUNT_FILEDIRTY) - uvmexp.wired);
  983         uvm_estimatepageable(&active, &inactive);
  984         if (filepages >= MIN((active + inactive) >> 4,
  985             5 * 1024 * 1024 >> PAGE_SHIFT)) {
  986                 return true;
  987         }
  988 
  989         /*
   990          * kill the process, fail allocation, etc.
  991          */
  992 
  993         return false;
  994 }
  995 
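/*
 * Worked example (hypothetical numbers): with 4 KiB pages the 5 MB cap
 * is (5 * 1024 * 1024) >> 12 = 1280 pages.  If the pdpolicy reports
 * 100000 active + inactive pages, (active + inactive) >> 4 = 6250, so
 * the threshold is MIN(6250, 1280) = 1280: waiting on the pagedaemon is
 * judged worthwhile while at least 1280 file pages remain reclaimable.
 */
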
  996 void
  997 uvm_estimatepageable(int *active, int *inactive)
  998 {
  999 
 1000         uvmpdpol_estimatepageable(active, inactive);
 1001 }
 1002 
 1003 
 1004 /*
 1005  * Use a separate thread for draining pools.
  1006  * This work can't be done from the main pagedaemon thread because
 1007  * some pool allocators need to take vm_map locks.
 1008  */
 1009 
 1010 static void
 1011 uvmpd_pool_drain_thread(void *arg)
 1012 {
 1013         struct pool *firstpool, *curpool;
 1014         int bufcnt, lastslept;
 1015         bool cycled;
 1016 
 1017         firstpool = NULL;
 1018         cycled = true;
 1019         for (;;) {
 1020                 /*
 1021                  * sleep until awoken by the pagedaemon.
 1022                  */
 1023                 mutex_enter(&uvmpd_lock);
 1024                 if (!uvmpd_pool_drain_run) {
 1025                         lastslept = getticks();
 1026                         cv_wait(&uvmpd_pool_drain_cv, &uvmpd_lock);
 1027                         if (getticks() != lastslept) {
 1028                                 cycled = false;
 1029                                 firstpool = NULL;
 1030                         }
 1031                 }
 1032                 uvmpd_pool_drain_run = false;
 1033                 mutex_exit(&uvmpd_lock);
 1034 
 1035                 /*
 1036                  * rate limit draining, otherwise in desperate circumstances
 1037                  * this can totally saturate the system with xcall activity.
 1038                  */
 1039                 if (cycled) {
 1040                         kpause("uvmpdlmt", false, 1, NULL);
 1041                         cycled = false;
 1042                         firstpool = NULL;
 1043                 }
 1044 
 1045                 /*
 1046                  * drain and temporarily disable the freelist cache.
 1047                  */
 1048                 uvm_pgflcache_pause();
 1049 
 1050                 /*
 1051                  * kill unused metadata buffers.
 1052                  */
 1053                 bufcnt = uvmexp.freetarg - uvm_availmem(false);
 1054                 if (bufcnt < 0)
 1055                         bufcnt = 0;
 1056 
 1057                 mutex_enter(&bufcache_lock);
 1058                 buf_drain(bufcnt << PAGE_SHIFT);
 1059                 mutex_exit(&bufcache_lock);
 1060 
 1061                 /*
 1062                  * drain a pool, and then re-enable the freelist cache.
 1063                  */
 1064                 (void)pool_drain(&curpool);
 1065                 KASSERT(curpool != NULL);
 1066                 if (firstpool == NULL) {
 1067                         firstpool = curpool;
 1068                 } else if (firstpool == curpool) {
 1069                         cycled = true;
 1070                 }
 1071                 uvm_pgflcache_resume();
 1072         }
 1073         /*NOTREACHED*/
 1074 }
 1075 
 1076 static void
 1077 uvmpd_pool_drain_wakeup(void)
 1078 {
 1079 
 1080         mutex_enter(&uvmpd_lock);
 1081         uvmpd_pool_drain_run = true;
 1082         cv_signal(&uvmpd_pool_drain_cv);
 1083         mutex_exit(&uvmpd_lock);
 1084 }
