sys/common/os/vm_pageout.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*        All Rights Reserved   */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vmparam.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>
#include <sys/mem_cage.h>
#include <sys/time.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

static int checkpage(page_t *, int);

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time
 * based on the size of the system.  If they are patched non-zero in a
 * loaded vmunix, they are left alone and may thus be changed per system
 * using adb on the loaded system.
 */
pgcnt_t         slowscan = 0;
pgcnt_t         fastscan = 0;

static pgcnt_t  handspreadpages = 0;
static int      loopfraction = 2;
static pgcnt_t  looppages;
static int      min_percent_cpu = 4;
static int      max_percent_cpu = 80;
static pgcnt_t  maxfastscan = 0;
static pgcnt_t  maxslowscan = 100;

pgcnt_t maxpgio = 0;
pgcnt_t minfree = 0;
pgcnt_t desfree = 0;
pgcnt_t lotsfree = 0;
pgcnt_t needfree = 0;
pgcnt_t throttlefree = 0;
pgcnt_t pageout_reserve = 0;

pgcnt_t deficit;
pgcnt_t nscan;
pgcnt_t desscan;

/*
 * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
 * are the number of ticks in each wakeup cycle that gives the
 * equivalent of some underlying %CPU duty cycle.
 * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is
 * awakened every 25 clock ticks.  So, converting from %CPU to ticks
 * per wakeup cycle would be x% of 25, that is (x * 25) / 100.
 * So, for example, 4% == 1 tick and 80% == 20 ticks.
 *
 * min_pageout_ticks:
 *     ticks/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_ticks:
 *     ticks/wakeup equivalent of max_percent_cpu.
 *
 * pageout_ticks:
 *     Number of clock ticks budgeted for each wakeup cycle.
 *     Computed each time around by schedpaging().
 *     Varies between min_pageout_ticks .. max_pageout_ticks,
 *     depending on memory pressure.
 *
 * pageout_lbolt:
 *     Timestamp of the last time pageout_scanner woke up and started
 *     (or resumed) scanning for not recently referenced pages.
 */

static clock_t  min_pageout_ticks;
static clock_t  max_pageout_ticks;
static clock_t  pageout_ticks;
static clock_t  pageout_lbolt;

static uint_t   reset_hands;

#define PAGES_POLL_MASK 1023
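
/*
 * The CPU-budget check in pageout_scanner() is guarded by
 * "(pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK", which is true once
 * every PAGES_POLL_MASK + 1 (i.e., 1024) pages visited, so the cost of
 * reading the clock is paid at most once per 1024 pages rather than on
 * every page.
 */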

/*
 * pageout_sample_lim:
 *     The limit on the number of samples needed to establish a value
 *     for new pageout parameters, fastscan, slowscan, and handspreadpages.
 *
 * pageout_sample_cnt:
 *     Current sample number.  Once the sample gets large enough,
 *     set new values for handspreadpages, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *     The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *     The accumulated nanoseconds of elapsed scan time for the sample.
 *
 * pageout_rate:
 *     Rate in pages/second, computed at the end of sampling.
 *
 * pageout_new_spread:
 *     The new value to use for fastscan and handspreadpages.
 *     Calculated after enough samples have been taken.
 */

typedef hrtime_t hrrate_t;

static uint64_t pageout_sample_lim = 4;
static uint64_t pageout_sample_cnt = 0;
static pgcnt_t  pageout_sample_pages = 0;
static hrrate_t pageout_rate = 0;
static pgcnt_t  pageout_new_spread = 0;

static clock_t  pageout_cycle_ticks;
static hrtime_t sample_start, sample_end;
static hrtime_t pageout_sample_etime = 0;

/*
 * Record number of times a pageout_scanner wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;

#ifdef VM_STATS
static struct pageoutvmstats_str {
        ulong_t checkpage[3];
} pageoutvmstats;
#endif /* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t        memavail_lock;
kcondvar_t      memavail_cv;

/*
 * The size of the clock loop.
 */
#define LOOPPAGES       total_pages

/*
 * Set up the paging constants for the clock algorithm.
 * Called after the system is initialized and the amount of memory
 * and number of paging devices is known.
 *
 * lotsfree is 1/64 of memory, but at least 512K.
 * desfree is 1/2 of lotsfree.
 * minfree is 1/2 of desfree.
 *
 * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
 *
 *      lotsfree = btop(512K)
 *      desfree = btop(200K)
 *      minfree = btop(100K)
 *      throttlefree = INT_MIN
 *      max_percent_cpu = 4
 */
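/*
 * As a worked example (hypothetical machine, not from the original
 * comments): with 4K pages and 1 GB of memory, looppages is 262144,
 * so lotsfree = MAX(262144 / 64, btop(512K)) = 4096 pages (16 MB),
 * desfree = 2048 pages (8 MB) and minfree = 1024 pages (4 MB), with
 * throttlefree defaulting to minfree and pageout_reserve to
 * throttlefree / 2 = 512 pages.
 */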
void
setupclock(int recalc)
{

        static spgcnt_t init_lfree, init_dfree, init_mfree;
        static spgcnt_t init_tfree, init_preserve, init_mpgio;
        static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;

        looppages = LOOPPAGES;

        /*
         * setupclock can now be called to recalculate the paging
         * parameters in the case of dynamic addition of memory.
         * To make sure we make the proper calculations, should such a
         * situation arise, we save away the initial values of each
         * parameter so we can recall them when needed.  This way we
         * don't lose the settings an admin might have made through
         * the /etc/system file.
         */

        if (!recalc) {
                init_lfree = lotsfree;
                init_dfree = desfree;
                init_mfree = minfree;
                init_tfree = throttlefree;
                init_preserve = pageout_reserve;
                init_mpgio = maxpgio;
                init_mfscan = maxfastscan;
                init_fscan = fastscan;
                init_sscan = slowscan;
                init_hspages = handspreadpages;
        }

        /*
         * Set up thresholds for paging:
         */

        /*
         * Lotsfree is the threshold at which the paging daemon turns on.
         */
        if (init_lfree == 0 || init_lfree >= looppages)
                lotsfree = MAX(looppages / 64, btop(512 * 1024));
        else
                lotsfree = init_lfree;

        /*
         * Desfree is the amount of memory desired free.
         * If free memory stays below this for an extended period,
         * start swapping.
         */
        if (init_dfree == 0 || init_dfree >= lotsfree)
                desfree = lotsfree / 2;
        else
                desfree = init_dfree;

        /*
         * Minfree is the minimal amount of free memory that is tolerable.
         */
        if (init_mfree == 0 || init_mfree >= desfree)
                minfree = desfree / 2;
        else
                minfree = init_mfree;

        /*
         * Throttlefree is the point at which we start throttling
         * PG_WAIT requests until enough memory becomes available.
         */
        if (init_tfree == 0 || init_tfree >= desfree)
                throttlefree = minfree;
        else
                throttlefree = init_tfree;

        /*
         * Pageout_reserve is the number of pages that we keep in
         * stock for pageout's own use.  Having a few such pages
         * provides insurance against system deadlock due to
         * pageout needing pages.  When freemem < pageout_reserve,
         * non-blocking allocations are denied to any threads
         * other than pageout and sched.  (At some point we might
         * want to consider a per-thread flag like T_PUSHING_PAGES
         * to indicate that a thread is part of the page-pushing
         * dance (e.g. an interrupt thread) and thus is entitled
         * to the same special dispensation we accord pageout.)
         */
        if (init_preserve == 0 || init_preserve >= throttlefree)
                pageout_reserve = throttlefree / 2;
        else
                pageout_reserve = init_preserve;

        /*
         * Maxpgio bounds how much paging i/o is acceptable.
         * This figures that 2/3 busy on a disk arm is all that is
         * tolerable for paging.  We assume one operation per disk rev.
         *
         * XXX - Does not account for multiple swap devices.
         */
        if (init_mpgio == 0)
                maxpgio = (DISKRPM * 2) / 3;
        else
                maxpgio = init_mpgio;

        /*
         * The clock scan rate varies between fastscan and slowscan
         * based on the amount of free memory available.  Fastscan
         * rate should be set based on the number of pages that can be
         * scanned per sec using ~10% of processor time.  Since this
         * value depends on the processor, MMU, MHz, etc., it is
         * difficult to determine it in a generic manner for all
         * architectures.
         *
         * Instead of trying to determine the number of pages scanned
         * per sec for every processor, fastscan is set to be the smaller
         * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
         * time is limited to ~4% of processor time.
         *
         * Setting fastscan to be 1/2 of memory allows pageout to scan
         * all of memory in ~2 secs.  This implies that user pages not
         * accessed within 1 sec (assuming handspreadpages == fastscan)
         * can be reclaimed when free memory is very low.  Stealing pages
         * not accessed within 1 sec seems reasonable and ensures that
         * active user processes don't thrash.
         *
         * Smaller values of fastscan result in scanning fewer pages
         * every second and consequently pageout may not be able to free
         * sufficient memory to maintain the minimum threshold.  Larger
         * values of fastscan result in scanning a lot more pages which
         * could lead to thrashing and higher CPU usage.
         *
         * Fastscan needs to be limited to a maximum value and should not
         * scale with memory to prevent pageout from consuming too much
         * time for scanning on slow CPUs and avoid thrashing, as a
         * result of scanning too many pages, on faster CPUs.
         * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
         * (the upper bound for fastscan) based on the average number
         * of pages that can potentially be scanned in ~1 sec (using ~4%
         * of the CPU) on some of the following machines that currently
         * run Solaris 2.x:
         *
         *                      average memory scanned in ~1 sec
         *
         *      25 MHz SS1+:            23 Meg
         *      LX:                     37 Meg
         *      50 MHz SC2000:          68 Meg
         *
         *      40 MHz 486:             26 Meg
         *      66 MHz 486:             42 Meg
         *
         * When free memory falls just below lotsfree, the scan rate
         * goes from 0 to slowscan (i.e., pageout starts running).  This
         * transition needs to be smooth and is achieved by ensuring that
         * pageout scans a small number of pages to satisfy the transient
         * memory demand.  This is set to not exceed 100 pages/sec (25 per
         * wakeup) since scanning that many pages has no noticeable impact
         * on system performance.
         *
         * In addition to setting fastscan and slowscan, pageout is
         * limited to using ~4% of the CPU.  This results in increasing
         * the time taken to scan all of memory, which in turn means that
         * user processes have a better opportunity of preventing their
         * pages from being stolen.  This has a positive effect on
         * interactive and overall system performance when memory demand
         * is high.
         *
         * Thus, the rate at which pages are scanned for replacement will
         * vary linearly between slowscan and the number of pages that
         * can be scanned using ~4% of processor time instead of varying
         * linearly between slowscan and fastscan.
         *
         * Also, the processor time used by pageout will vary from ~1%
         * at slowscan to ~4% at fastscan instead of varying between
         * ~1% at slowscan and ~10% at fastscan.
         *
         * The values chosen for the various VM parameters (fastscan,
         * handspreadpages, etc.) are not universally true for all machines,
         * but appear to be a good rule of thumb for the machines we've
         * tested.  They have the following ranges:
         *
         *      cpu speed:      20 to 70 MHz
         *      page size:      4K to 8K
         *      memory size:    16M to 5G
         *      page scan rate: 4000 - 17400 4K pages per sec
         *
         * The values need to be re-examined for machines which don't
         * fall into the various ranges (e.g., slower or faster CPUs,
         * smaller or larger page sizes, etc.) shown above.
         *
         * On an MP machine, pageout is often unable to maintain the
         * minimum paging thresholds under heavy load.  This is due to
         * the fact that user processes running on other CPUs can be
         * dirtying memory at a much faster pace than pageout can find
         * pages to free.  The memory demands could be met by enabling
         * more than one CPU to run the clock algorithm in such a manner
         * that the various clock hands don't overlap.  This also makes
         * it more difficult to determine the values for fastscan, slowscan
         * and handspreadpages.
         *
         * The swapper is currently used to free up memory when pageout
         * is unable to meet memory demands by swapping out processes.
         * In addition to freeing up memory, swapping also reduces the
         * demand for memory by preventing user processes from running
         * and thereby consuming memory.
         */
        if (init_mfscan == 0) {
                if (pageout_new_spread != 0)
                        maxfastscan = pageout_new_spread;
                else
                        maxfastscan = MAXHANDSPREADPAGES;
        } else {
                maxfastscan = init_mfscan;
        }
        if (init_fscan == 0)
                fastscan = MIN(looppages / loopfraction, maxfastscan);
        else
                fastscan = init_fscan;
        if (fastscan > looppages / loopfraction)
                fastscan = looppages / loopfraction;

        /*
         * Set slow scan time to 1/10 the fast scan time, but
         * not to exceed maxslowscan.
         */
        if (init_sscan == 0)
                slowscan = MIN(fastscan / 10, maxslowscan);
        else
                slowscan = init_sscan;
        if (slowscan > fastscan / 2)
                slowscan = fastscan / 2;

        /*
         * Handspreadpages is the distance (in pages) between the front
         * and back pageout daemon hands.  The amount of time to reclaim
         * a page once pageout examines it increases with this distance
         * and decreases as the scan rate rises.  It must be less than
         * the amount of pageable memory.
         *
         * Since pageout is limited to ~4% of the CPU, setting handspreadpages
         * to be "fastscan" results in the front hand being a few secs
         * (varies based on the processor speed) ahead of the back hand
         * at fastscan rates.  This distance can be further reduced, if
         * necessary, by increasing the processor time used by pageout
         * to be more than ~4% and preferably not more than ~10%.
         *
         * As a result, user processes have a much better chance of
         * referencing their pages before the back hand examines them.
         * This also significantly lowers the number of reclaims from
         * the freelist since pageout does not end up freeing pages which
         * may be referenced a sec later.
         */
        if (init_hspages == 0)
                handspreadpages = fastscan;
        else
                handspreadpages = init_hspages;

        /*
         * Make sure that the back hand follows the front hand by at least
         * 1/RATETOSCHEDPAGING seconds.  Without this test, it is possible
         * for the back hand to look at a page during the same wakeup of
         * the pageout daemon in which the front hand cleared its ref bit.
         */
        if (handspreadpages >= looppages)
                handspreadpages = looppages - 1;

        /*
         * If we have been called to recalculate the parameters,
         * set a flag to re-evaluate the clock hand pointers.
         */
        if (recalc)
                reset_hands = 1;
}

/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan RATETOSCHEDPAGING
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging sets this value based on the amount of
 * currently available memory.
 */

#define RATETOSCHEDPAGING       4               /* hz that is */
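
/*
 * With the traditional hz of 100, schedpaging() therefore reschedules
 * itself every hz / RATETOSCHEDPAGING == 25 clock ticks.
 */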

static kmutex_t pageout_mutex;  /* held while pageout or schedpaging running */

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;
static struct async_reqs *req_freelist; /* available req structs */
static struct async_reqs *push_list;    /* pending reqs */
static kmutex_t push_lock;              /* protects req pool */
static kcondvar_t push_cv;

static int async_list_size = 256;       /* number of async request structs */

static void pageout_scanner(void);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone; don't page it out.
 */
#define MIN_PO_SHARE    (8)
#define MAX_PO_SHARE    ((MIN_PO_SHARE) << 24)
ulong_t po_share = MIN_PO_SHARE;
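
/*
 * po_share adapts to memory pressure: pageout_scanner() doubles it (up
 * to MAX_PO_SHARE, i.e. 8 << 24) when a full revolution of the clock
 * hands frees too little memory, and schedpaging() halves it back toward
 * MIN_PO_SHARE once free memory is plentiful again.
 */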

/*
 * Schedule the rate for paging.
 * The rate is a linear interpolation between slowscan (when free memory
 * equals lotsfree) and fastscan (when out of memory).
 */
static void
schedpaging(void *arg)
{
        spgcnt_t vavail;

        if (freemem < lotsfree + needfree + kmem_reapahead)
                kmem_reap();

        if (freemem < lotsfree + needfree)
                seg_preap();

        if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
                kcage_cageout_wakeup();

        if (mutex_tryenter(&pageout_mutex)) {
                /* pageout() not running */
                nscan = 0;
                vavail = freemem - deficit;
                if (pageout_new_spread != 0)
                        vavail -= needfree;
                if (vavail < 0)
                        vavail = 0;
                if (vavail > lotsfree)
                        vavail = lotsfree;

                /*
                 * Fix for 1161438 (CRS SPR# 73922).  All variables
                 * in the original calculation for desscan were 32 bit signed
                 * ints.  As freemem approaches 0x0 on a system with 1 Gig or
                 * more of memory, the calculation can overflow.  When this
                 * happens, desscan becomes negative and pageout_scanner()
                 * stops paging out.
                 */
                if ((needfree) && (pageout_new_spread == 0)) {
                        /*
                         * If we've not yet collected enough samples to
                         * calculate a spread, use the old logic of kicking
                         * into high gear anytime needfree is non-zero.
                         */
                        desscan = fastscan / RATETOSCHEDPAGING;
                } else {
                        /*
                         * Once we've calculated a spread based on system
                         * memory and usage, just treat needfree as another
                         * form of deficit.
                         */
                        spgcnt_t faststmp, slowstmp, result;

                        slowstmp = slowscan * vavail;
                        faststmp = fastscan * (lotsfree - vavail);
                        result = (slowstmp + faststmp) /
                            nz(lotsfree) / RATETOSCHEDPAGING;
                        desscan = (pgcnt_t)result;
                }
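
                /*
                 * As a worked example (hypothetical numbers): with
                 * lotsfree == 4096, slowscan == 100, fastscan == 131072
                 * and vavail == 1024, this gives
                 * desscan = (100 * 1024 + 131072 * 3072) / 4096 / 4,
                 * i.e. about 24600 pages per wakeup.
                 */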

                pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
                    (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);
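
                /*
                 * pageout_ticks likewise interpolates linearly between
                 * min_pageout_ticks (vavail == lotsfree, no pressure) and
                 * max_pageout_ticks (vavail == 0); with hz == 100 that is
                 * a budget of between 1 and 20 ticks per wakeup cycle.
                 */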

                if (freemem < lotsfree + needfree ||
                    pageout_sample_cnt < pageout_sample_lim) {
                        TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
                            "pageout_cv_signal:freemem %ld", freemem);
                        cv_signal(&proc_pageout->p_cv);
                } else {
                        /*
                         * There are enough free pages, no need to
                         * kick the scanner thread.  And next time
                         * around, keep more of the `highly shared'
                         * pages.
                         */
                        cv_signal_pageout();
                        if (po_share > MIN_PO_SHARE) {
                                po_share >>= 1;
                        }
                }
                mutex_exit(&pageout_mutex);
        }

        /*
         * Signal threads waiting for available memory.
         * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
         * in this case it is not needed - the waiters will be woken up during
         * the next invocation of this function.
         */
        if (kmem_avail() > 0)
                cv_broadcast(&memavail_cv);

        (void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
}

pgcnt_t         pushes;
ulong_t         push_list_size;         /* # of requests on pageout queue */

#define FRONT   1
#define BACK    2

int dopageout = 1;      /* must be non-zero to turn page stealing on */

/*
 * The page out daemon, which runs as process 2.
 *
 * As long as there are at least lotsfree pages,
 * this process is not run.  When the number of free
 * pages stays in the range desfree to lotsfree,
 * this daemon runs through the pages in the loop
 * at a rate determined in schedpaging().  Pageout manages
 * two hands on the clock.  The front hand moves through
 * memory, clearing the reference bit,
 * and stealing pages from procs that are over maxrss.
 * The back hand travels a distance behind the front hand,
 * freeing the pages that have not been referenced in the time
 * since the front hand passed.  If modified, they are pushed to
 * swap before being freed.
 *
 * There are two threads that act on behalf of the pageout process.
 * One thread scans pages (pageout_scanner) and frees them up if
 * they don't require any VOP_PUTPAGE operation.  If a page must be
 * written back to its backing store, the request is put on a list
 * and the other (pageout) thread is signaled.  The pageout thread
 * grabs VOP_PUTPAGE requests from the list and processes them.
 * Some filesystems may require resources for the VOP_PUTPAGE
 * operations (like memory) and hence can block the pageout
 * thread, but the scanner thread can still operate.  There is still
 * no guarantee that memory deadlocks cannot occur.
 *
 * For now, this thing is in very rough form.
 */
void
pageout()
{
        struct async_reqs *arg;
        pri_t pageout_pri;
        int i;
        pgcnt_t max_pushes;
        callb_cpr_t cprinfo;

        proc_pageout = ttoproc(curthread);
        proc_pageout->p_cstime = 0;
        proc_pageout->p_stime = 0;
        proc_pageout->p_cutime = 0;
        proc_pageout->p_utime = 0;
        bcopy("pageout", PTOU(curproc)->u_psargs, 8);
        bcopy("pageout", PTOU(curproc)->u_comm, 7);

        /*
         * Create pageout scanner thread
         */
        mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

        /*
         * Allocate and initialize the async request structures
         * for pageout.
         */
        push_req = (struct async_reqs *)
            kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

        req_freelist = push_req;
        for (i = 0; i < async_list_size - 1; i++)
                push_req[i].a_next = &push_req[i + 1];

        pageout_pri = curthread->t_pri;

        /* Create the pageout scanner thread. */
        (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
            pageout_pri - 1);

        /*
         * kick off pageout scheduler.
         */
        schedpaging(NULL);

        /*
         * Create kernel cage thread.
         * The kernel cage thread is started under the pageout process
         * to take advantage of the less restricted page allocation
         * in page_create_throttle().
         */
        kcage_cageout_init();

        /*
         * Limit pushes to avoid saturating pageout devices.
         */
        max_pushes = maxpgio / RATETOSCHEDPAGING;
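
        /*
         * maxpgio is a per-second limit, so dividing it by
         * RATETOSCHEDPAGING yields the push quota for one scheduling
         * cycle; "pushes" is reset whenever this thread blocks on
         * push_cv below.
         */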
        CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

        for (;;) {
                mutex_enter(&push_lock);

                while ((arg = push_list) == NULL || pushes > max_pushes) {
                        CALLB_CPR_SAFE_BEGIN(&cprinfo);
                        cv_wait(&push_cv, &push_lock);
                        pushes = 0;
                        CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
                }
                push_list = arg->a_next;
                arg->a_next = NULL;
                mutex_exit(&push_lock);

                if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
                    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
                        pushes++;
                }

                /* vp held by checkpage() */
                VN_RELE(arg->a_vp);

                mutex_enter(&push_lock);
                arg->a_next = req_freelist;     /* back on freelist */
                req_freelist = arg;
                push_list_size--;
                mutex_exit(&push_lock);
        }
}

/*
 * Kernel thread that scans pages looking for ones to free
 */
static void
pageout_scanner(void)
{
        struct page *fronthand, *backhand;
        uint_t count;
        callb_cpr_t cprinfo;
        pgcnt_t nscan_limit;
        pgcnt_t pcount;

        CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
        mutex_enter(&pageout_mutex);

        /*
         * The restart case does not attempt to point the hands at roughly
         * the right point on the assumption that after one circuit things
         * will have settled down, and restarts shouldn't happen that often.
         */

        /*
         * Set the two clock hands to be separated by a reasonable amount,
         * but no more than 360 degrees apart.
         */
        backhand = page_first();
        if (handspreadpages >= total_pages)
                fronthand = page_nextn(backhand, total_pages - 1);
        else
                fronthand = page_nextn(backhand, handspreadpages);

        min_pageout_ticks = MAX(1,
            ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
        max_pageout_ticks = MAX(min_pageout_ticks,
            ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);

loop:
        cv_signal_pageout();

        CALLB_CPR_SAFE_BEGIN(&cprinfo);
        cv_wait(&proc_pageout->p_cv, &pageout_mutex);
        CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);

        if (!dopageout)
                goto loop;

        if (reset_hands) {
                reset_hands = 0;

                backhand = page_first();
                if (handspreadpages >= total_pages)
                        fronthand = page_nextn(backhand, total_pages - 1);
                else
                        fronthand = page_nextn(backhand, handspreadpages);
        }

        CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
        count = 0;

        TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
            "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
            freemem, lotsfree, nscan, desscan);

        /* Kernel probe */
        TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
            tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);

        pcount = 0;
        if (pageout_sample_cnt < pageout_sample_lim) {
                nscan_limit = total_pages;
        } else {
                nscan_limit = desscan;
        }
        pageout_lbolt = ddi_get_lbolt();
        sample_start = gethrtime();

        /*
         * Scan the appropriate number of pages for a single duty cycle.
         * However, stop scanning as soon as there is enough free memory.
         * For a short while we will be sampling the performance of the
         * scanner, and need to keep running just to gather sample data;
         * in that case we keep going regardless of whether there is
         * enough free memory.
         */

        while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
            pageout_sample_cnt < pageout_sample_lim)) {
                int rvfront, rvback;

                /*
                 * Check to see if we have exceeded our %CPU budget
                 * for this wakeup, but not on every single page visited,
                 * just every once in a while.
                 */
                if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
                        pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
                        if (pageout_cycle_ticks >= pageout_ticks) {
                                ++pageout_timeouts;
                                break;
                        }
                }

                /*
                 * If checkpage manages to add a page to the free list,
                 * we give ourselves another couple of trips around the loop.
                 */
                if ((rvfront = checkpage(fronthand, FRONT)) == 1)
                        count = 0;
                if ((rvback = checkpage(backhand, BACK)) == 1)
                        count = 0;

                ++pcount;

                /*
                 * protected by pageout_mutex instead of cpu_stat_lock
                 */
                CPU_STATS_ADDQ(CPU, vm, scan, 1);

                /*
                 * Don't include ineligible pages in the number scanned.
                 */
                if (rvfront != -1 || rvback != -1)
                        nscan++;

                backhand = page_next(backhand);

                /*
                 * backhand update and wraparound check are done separately
                 * because lint barks when it finds an empty "if" body
                 */

                if ((fronthand = page_next(fronthand)) == page_first()) {
                        TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
                            "pageout_hand_wrap:freemem %ld whichhand %d",
                            freemem, FRONT);

                        /*
                         * protected by pageout_mutex instead of cpu_stat_lock
                         */
                        CPU_STATS_ADDQ(CPU, vm, rev, 1);
                        if (++count > 1) {
                                /*
                                 * Extremely unlikely, but it happens.
                                 * We went around the loop at least once
                                 * and didn't get far enough.
                                 * If we are still skipping `highly shared'
                                 * pages, skip fewer of them.  Otherwise,
                                 * give up till the next clock tick.
                                 */
                                if (po_share < MAX_PO_SHARE) {
                                        po_share <<= 1;
                                } else {
                                        /*
                                         * Really a "goto loop", but
                                         * if someone is TRACing or
                                         * TNF_PROBE_ing, at least
                                         * make records to show
                                         * where we are.
                                         */
                                        break;
                                }
                        }
                }
        }

        sample_end = gethrtime();

        TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
            "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
            freemem, lotsfree, nscan, desscan, count);

        /* Kernel probe */
        TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
            tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem);

        if (pageout_sample_cnt < pageout_sample_lim) {
                pageout_sample_pages += pcount;
                pageout_sample_etime += sample_end - sample_start;
                ++pageout_sample_cnt;
        }
        if (pageout_sample_cnt >= pageout_sample_lim &&
            pageout_new_spread == 0) {
                pageout_rate = (hrrate_t)pageout_sample_pages *
                    (hrrate_t)(NANOSEC) / pageout_sample_etime;
                pageout_new_spread = pageout_rate / 10;
                setupclock(1);
        }
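
        /*
         * As a worked example (hypothetical sample): if the first four
         * scans visited 400000 pages in a total of 2 seconds of scan
         * time (2e9 ns of etime), then
         * pageout_rate = 400000 * 1e9 / 2e9 = 200000 pages/sec, and
         * pageout_new_spread = 20000 pages, which setupclock(1) then
         * uses as the new maxfastscan (and hence fastscan and
         * handspreadpages, assuming no /etc/system overrides).
         */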

        goto loop;
}

/*
 * Look at the page at hand.  If it is locked (e.g., for physical i/o),
 * system (u., page table) or free, then leave it alone.  Otherwise,
 * if we are running the front hand, turn off the page's reference bit.
 * If the proc is over maxrss, we take it.  If running the back hand,
 * check whether the page has been reclaimed.  If not, free the page,
 * pushing it to disk first if necessary.
 *
 * Return values:
 *      -1 if the page is not a candidate at all,
 *       0 if not freed, or
 *       1 if we freed it.
 */
static int
checkpage(struct page *pp, int whichhand)
{
        int ppattr;
        int isfs = 0;
        int isexec = 0;
        int pagesync_flag;

        /*
         * Skip pages:
         *      - associated with the kernel vnode, since
         *          they are always "exclusively" locked,
         *      - that are free,
         *      - that are shared more than po_share times,
         *      - that are already locked.
         *
         * NOTE:  These optimizations assume that reads are atomic.
         */

        if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
            pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
            hat_page_checkshare(pp, po_share)) {
                return (-1);
        }

        if (!page_trylock(pp, SE_EXCL)) {
                /*
                 * Skip the page if we can't acquire the "exclusive" lock.
                 */
                return (-1);
        } else if (PP_ISFREE(pp)) {
                /*
                 * It became free between the above check and our actually
                 * locking the page.  Oh well, there will be other pages.
                 */
                page_unlock(pp);
                return (-1);
        }

        /*
         * Reject pages that cannot be freed.  The page_struct_lock
         * need not be acquired to examine these fields since the page
         * has an "exclusive" lock.
         */
        if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
                page_unlock(pp);
                return (-1);
        }

        /*
         * Maintain statistics for what we are freeing
         */

        if (pp->p_vnode != NULL) {
                if (pp->p_vnode->v_flag & VVMEXEC)
                        isexec = 1;

                if (!IS_SWAPFSVP(pp->p_vnode))
                        isfs = 1;
        }

        /*
         * Turn off REF and MOD bits with the front hand.
         * The back hand examines the REF bit and always considers
         * SHARED pages as referenced.
         */
        if (whichhand == FRONT)
                pagesync_flag = HAT_SYNC_ZERORM;
        else
                pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
                    HAT_SYNC_STOPON_SHARED;

        ppattr = hat_pagesync(pp, pagesync_flag);

recheck:
        /*
         * If the page is referenced, make it unreferenced but reclaimable.
         * If this page is not referenced, then it must be reclaimable
         * and we can add it to the free list.
         */
        if (ppattr & P_REF) {
                TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
                    "pageout_isref:pp %p whichhand %d", pp, whichhand);
                if (whichhand == FRONT) {
                        /*
                         * Checking of rss or madvise flags needed here...
                         *
                         * If not "well-behaved", fall through into the code
                         * for not referenced.
                         */
                        hat_clrref(pp);
                }
                /*
                 * Somebody referenced the page since the front
                 * hand went by, so it's not a candidate for
                 * freeing up.
                 */
                page_unlock(pp);
                return (0);
        }

        VM_STAT_ADD(pageoutvmstats.checkpage[0]);

        /*
         * If this is a large page, attempt to demote it.  If successfully
         * demoted, retry the checkpage.
         */
        if (pp->p_szc != 0) {
                if (!page_try_demote_pages(pp)) {
                        VM_STAT_ADD(pageoutvmstats.checkpage[1]);
                        page_unlock(pp);
                        return (-1);
                }
                ASSERT(pp->p_szc == 0);
                VM_STAT_ADD(pageoutvmstats.checkpage[2]);
                /*
                 * Since page_try_demote_pages() could have unloaded some
                 * mappings, it makes sense to reload ppattr.
                 */
                ppattr = hat_page_getattr(pp, P_MOD | P_REF);
        }

        /*
         * If the page is currently dirty, we have to arrange
         * to have it cleaned before it can be freed.
         *
         * XXX - ASSERT(pp->p_vnode != NULL);
         */
        if ((ppattr & P_MOD) && pp->p_vnode) {
                struct vnode *vp = pp->p_vnode;
                u_offset_t offset = pp->p_offset;

                /*
                 * XXX - Test for process being swapped out or about to exit?
                 * [Can't get back to process(es) using the page.]
                 */

                /*
                 * Hold the vnode before releasing the page lock to
                 * prevent it from being freed and re-used by some
                 * other thread.
                 */
                VN_HOLD(vp);
                page_unlock(pp);

                /*
                 * Queue i/o request for the pageout thread.
                 */
                if (!queue_io_request(vp, offset)) {
                        VN_RELE(vp);
                        return (0);
                }
                return (1);
        }

        /*
         * Now we unload all the translations and put the page back
         * onto the free list.  If the page was used (referenced or
         * modified) after the pagesync but before it was unloaded, we
         * catch it and handle the page properly.
         */
        TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
            "pageout_free:pp %p whichhand %d", pp, whichhand);
        (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
        ppattr = hat_page_getattr(pp, P_MOD | P_REF);
        if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
                goto recheck;

        /*LINTED: constant in conditional context*/
        VN_DISPOSE(pp, B_FREE, 0, kcred);

        CPU_STATS_ADD_K(vm, dfree, 1);

        if (isfs) {
                if (isexec) {
                        CPU_STATS_ADD_K(vm, execfree, 1);
                } else {
                        CPU_STATS_ADD_K(vm, fsfree, 1);
                }
        } else {
                CPU_STATS_ADD_K(vm, anonfree, 1);
        }

        return (1);             /* freed a page! */
}

/*
 * Queue async i/o request from pageout_scanner and segment swapout
 * routines on one common list.  This ensures that pageout devices (swap)
 * are not saturated by pageout_scanner or swapout requests.
 * The pageout thread empties this list by initiating i/o operations.
 */
int
queue_io_request(vnode_t *vp, u_offset_t off)
{
        struct async_reqs *arg;

        /*
         * If we cannot allocate an async request struct,
         * skip this page.
         */
        mutex_enter(&push_lock);
        if ((arg = req_freelist) == NULL) {
                mutex_exit(&push_lock);
                return (0);
        }
        req_freelist = arg->a_next;             /* adjust freelist */
        push_list_size++;

        arg->a_vp = vp;
        arg->a_off = off;
        arg->a_len = PAGESIZE;
        arg->a_flags = B_ASYNC | B_FREE;
        arg->a_cred = kcred;            /* always held */

        /*
         * Add to list of pending write requests.
         */
        arg->a_next = push_list;
        push_list = arg;

        if (req_freelist == NULL) {
                /*
                 * No free async requests left.  The lock is held so we
                 * might as well signal the pusher thread now.
                 */
                cv_signal(&push_cv);
        }
        mutex_exit(&push_lock);
        return (1);
}

/*
 * Wakeup pageout to initiate i/o if push_list is not empty.
 */
void
cv_signal_pageout()
{
        if (push_list != NULL) {
                mutex_enter(&push_lock);
                cv_signal(&push_cv);
                mutex_exit(&push_lock);
        }
}
