
FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_swapcache.c


    1 /*
    2  * (MPSAFE)
    3  *
    4  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
    5  *
    6  * This code is derived from software contributed to The DragonFly Project
    7  * by Matthew Dillon <dillon@backplane.com>
    8  *
    9  * Redistribution and use in source and binary forms, with or without
   10  * modification, are permitted provided that the following conditions
   11  * are met:
   12  *
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in
   17  *    the documentation and/or other materials provided with the
   18  *    distribution.
   19  * 3. Neither the name of The DragonFly Project nor the names of its
   20  *    contributors may be used to endorse or promote products derived
   21  *    from this software without specific, prior written permission.
   22  *
   23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
   27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
   29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  */
   36 
   37 /*
    38  * Implement the swapcache daemon.  When enabled, swap is assumed to be
    39  * configured on a fast storage device such as an SSD.  Swap is assigned
   40  * to clean vnode-backed pages in the inactive queue, clustered by object
   41  * if possible, and written out.  The swap assignment sticks around even
   42  * after the underlying pages have been recycled.
   43  *
   44  * The daemon manages write bandwidth based on sysctl settings to control
   45  * wear on the SSD.
   46  *
   47  * The vnode strategy code will check for the swap assignments and divert
   48  * reads to the swap device when the data is present in the swapcache.
   49  *
   50  * This operates on both regular files and the block device vnodes used by
   51  * filesystems to manage meta-data.
   52  */
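
/*
 * [Editor's example -- not part of the original file.]  The daemon is
 * driven entirely by the vm.swapcache.* sysctls declared below.  A minimal
 * userland sketch (compile separately, run as root) that enables data and
 * meta-data caching plus read diversion might look like this; error
 * handling is reduced to perror() for brevity.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        int one = 1;

        /* Cache clean file data and filesystem meta-data on swap. */
        if (sysctlbyname("vm.swapcache.data_enable", NULL, NULL,
                         &one, sizeof(one)) < 0)
                perror("vm.swapcache.data_enable");
        if (sysctlbyname("vm.swapcache.meta_enable", NULL, NULL,
                         &one, sizeof(one)) < 0)
                perror("vm.swapcache.meta_enable");
        /* Divert reads to the swap device when data is in the swapcache. */
        if (sysctlbyname("vm.swapcache.read_enable", NULL, NULL,
                         &one, sizeof(one)) < 0)
                perror("vm.swapcache.read_enable");
        return (0);
}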
   53 
   54 #include "opt_vm.h"
   55 #include <sys/param.h>
   56 #include <sys/systm.h>
   57 #include <sys/kernel.h>
   58 #include <sys/proc.h>
   59 #include <sys/kthread.h>
   60 #include <sys/resourcevar.h>
   61 #include <sys/signalvar.h>
   62 #include <sys/vnode.h>
   63 #include <sys/vmmeter.h>
   64 #include <sys/sysctl.h>
   65 #include <sys/eventhandler.h>
   66 
   67 #include <vm/vm.h>
   68 #include <vm/vm_param.h>
   69 #include <sys/lock.h>
   70 #include <vm/vm_object.h>
   71 #include <vm/vm_page.h>
   72 #include <vm/vm_map.h>
   73 #include <vm/vm_pageout.h>
   74 #include <vm/vm_pager.h>
   75 #include <vm/swap_pager.h>
   76 #include <vm/vm_extern.h>
   77 
   78 #include <sys/thread2.h>
   79 #include <sys/spinlock2.h>
   80 #include <vm/vm_page2.h>
   81 
    82 /* the kernel process "swapcached" */
   83 static int vm_swapcached_flush (vm_page_t m, int isblkdev);
   84 static int vm_swapcache_test(vm_page_t m);
   85 static int vm_swapcache_writing_heuristic(void);
   86 static int vm_swapcache_writing(vm_page_t marker, int count, int scount);
   87 static void vm_swapcache_cleaning(vm_object_t marker, int *swindexp);
   88 static void vm_swapcache_movemarker(vm_object_t marker, int swindex,
   89                                 vm_object_t object);
   90 struct thread *swapcached_thread;
   91 
   92 SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);
   93 
   94 int vm_swapcache_read_enable;
   95 int vm_swapcache_inactive_heuristic;
   96 static int vm_swapcache_sleep;
   97 static int vm_swapcache_maxscan = PQ_L2_SIZE * 8;
   98 static int vm_swapcache_maxlaunder = PQ_L2_SIZE * 4;
   99 static int vm_swapcache_data_enable = 0;
  100 static int vm_swapcache_meta_enable = 0;
  101 static int vm_swapcache_maxswappct = 75;
  102 static int vm_swapcache_hysteresis;
  103 static int vm_swapcache_min_hysteresis;
  104 int vm_swapcache_use_chflags = 1;       /* require chflags cache */
  105 static int64_t vm_swapcache_minburst = 10000000LL;      /* 10MB */
  106 static int64_t vm_swapcache_curburst = 4000000000LL;    /* 4G after boot */
  107 static int64_t vm_swapcache_maxburst = 2000000000LL;    /* 2G nominal max */
  108 static int64_t vm_swapcache_accrate = 100000LL;         /* 100K/s */
  109 static int64_t vm_swapcache_write_count;
  110 static int64_t vm_swapcache_maxfilesize;
  111 static int64_t vm_swapcache_cleanperobj = 16*1024*1024;
  112 
  113 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
  114         CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");
  115 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxscan,
  116         CTLFLAG_RW, &vm_swapcache_maxscan, 0, "");
  117 
  118 SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
  119         CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
  120 SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
  121         CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
  122 SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable,
  123         CTLFLAG_RW, &vm_swapcache_read_enable, 0, "");
  124 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxswappct,
  125         CTLFLAG_RW, &vm_swapcache_maxswappct, 0, "");
  126 SYSCTL_INT(_vm_swapcache, OID_AUTO, hysteresis,
  127         CTLFLAG_RD, &vm_swapcache_hysteresis, 0, "");
  128 SYSCTL_INT(_vm_swapcache, OID_AUTO, min_hysteresis,
  129         CTLFLAG_RW, &vm_swapcache_min_hysteresis, 0, "");
  130 SYSCTL_INT(_vm_swapcache, OID_AUTO, use_chflags,
  131         CTLFLAG_RW, &vm_swapcache_use_chflags, 0, "");
  132 
  133 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, minburst,
  134         CTLFLAG_RW, &vm_swapcache_minburst, 0, "");
  135 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst,
  136         CTLFLAG_RW, &vm_swapcache_curburst, 0, "");
  137 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst,
  138         CTLFLAG_RW, &vm_swapcache_maxburst, 0, "");
  139 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxfilesize,
  140         CTLFLAG_RW, &vm_swapcache_maxfilesize, 0, "");
  141 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate,
  142         CTLFLAG_RW, &vm_swapcache_accrate, 0, "");
  143 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
  144         CTLFLAG_RW, &vm_swapcache_write_count, 0, "");
  145 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, cleanperobj,
  146         CTLFLAG_RW, &vm_swapcache_cleanperobj, 0, "");
  147 
  148 #define SWAPMAX(adj)    \
  149         ((int64_t)vm_swap_max * (vm_swapcache_maxswappct + (adj)) / 100)
  150 
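/*
 * [Editor's worked example -- not part of the original file.]  With the
 * default vm_swapcache_maxswappct of 75, SWAPMAX(0) is 75% of vm_swap_max
 * and SWAPMAX(-10) is 65%; the main loop writes until swap-cache use
 * exceeds the former, then cleans until it drops below the latter.  A
 * standalone sketch of the same arithmetic (compile separately):
 */
#include <stdio.h>

static long long
swapmax(long long swap_max_pages, int maxswappct, int adj)
{
        return (swap_max_pages * (maxswappct + adj) / 100);
}

int
main(void)
{
        long long swap_max = 8LL * 1024 * 1024 * 1024 / 4096; /* 8G in 4K pages */

        printf("write until use > %lld pages\n", swapmax(swap_max, 75, 0));
        printf("clean until use < %lld pages\n", swapmax(swap_max, 75, -10));
        return (0);
}
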
  151 /*
  152  * When shutting down the machine we want to stop swapcache operation
  153  * immediately so swap is not accessed after devices have been shuttered.
  154  */
  155 static void
  156 shutdown_swapcache(void *arg __unused)
  157 {
  158         vm_swapcache_read_enable = 0;
  159         vm_swapcache_data_enable = 0;
  160         vm_swapcache_meta_enable = 0;
  161         wakeup(&vm_swapcache_sleep);    /* shortcut 5-second wait */
  162 }
  163 
  164 /*
  165  * vm_swapcached is the high level pageout daemon.
  166  *
  167  * No requirements.
  168  */
  169 static void
  170 vm_swapcached_thread(void)
  171 {
  172         enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
  173         enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
  174         static struct vm_page page_marker[PQ_L2_SIZE];
  175         static struct vm_object swmarker;
  176         static int swindex;
  177         int q;
  178 
  179         /*
  180          * Thread setup
  181          */
  182         curthread->td_flags |= TDF_SYSTHREAD;
  183         EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
  184                               swapcached_thread, SHUTDOWN_PRI_FIRST);
  185         EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_swapcache,
  186                               NULL, SHUTDOWN_PRI_SECOND);
  187 
  188         /*
  189          * Initialize our marker for the inactive scan (SWAPC_WRITING)
  190          */
  191         bzero(&page_marker, sizeof(page_marker));
  192         for (q = 0; q < PQ_L2_SIZE; ++q) {
  193                 page_marker[q].flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
  194                 page_marker[q].queue = PQ_INACTIVE + q;
  195                 page_marker[q].pc = q;
  196                 page_marker[q].wire_count = 1;
  197                 vm_page_queues_spin_lock(PQ_INACTIVE + q);
  198                 TAILQ_INSERT_HEAD(
  199                         &vm_page_queues[PQ_INACTIVE + q].pl,
  200                         &page_marker[q], pageq);
  201                 vm_page_queues_spin_unlock(PQ_INACTIVE + q);
  202         }
  203 
  204         vm_swapcache_min_hysteresis = 1024;
  205         vm_swapcache_hysteresis = vm_swapcache_min_hysteresis;
  206         vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
  207 
  208         /*
  209          * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
  210          */
  211         bzero(&swmarker, sizeof(swmarker));
  212         swmarker.type = OBJT_MARKER;
  213         swindex = 0;
  214         lwkt_gettoken(&vmobj_tokens[swindex]);
  215         TAILQ_INSERT_HEAD(&vm_object_lists[swindex],
  216                           &swmarker, object_list);
  217         lwkt_reltoken(&vmobj_tokens[swindex]);
  218 
  219         for (;;) {
  220                 int reached_end;
  221                 int scount;
  222                 int count;
  223 
  224                 /*
  225                  * Handle shutdown
  226                  */
  227                 kproc_suspend_loop();
  228 
  229                 /*
  230                  * Check every 5 seconds when not enabled or if no swap
  231                  * is present.
  232                  */
  233                 if ((vm_swapcache_data_enable == 0 &&
  234                      vm_swapcache_meta_enable == 0) ||
  235                     vm_swap_max == 0) {
  236                         tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
  237                         continue;
  238                 }
  239 
  240                 /*
   241                  * Polling rate when enabled is approximately 10 Hz.
  242                  */
  243                 tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);
  244 
  245                 /*
  246                  * State hysteresis.  Generate write activity up to 75% of
   247                  * swap, then clean out swap assignments down to 65%, then
  248                  * repeat.
  249                  */
  250                 if (state == SWAPC_WRITING) {
  251                         if (vm_swap_cache_use > SWAPMAX(0))
  252                                 state = SWAPC_CLEANING;
  253                 } else {
  254                         if (vm_swap_cache_use < SWAPMAX(-10))
  255                                 state = SWAPC_WRITING;
  256                 }
  257 
  258                 /*
  259                  * We are allowed to continue accumulating burst value
  260                  * in either state.  Allow the user to set curburst > maxburst
  261                  * for the initial load-in.
  262                  */
  263                 if (vm_swapcache_curburst < vm_swapcache_maxburst) {
  264                         vm_swapcache_curburst += vm_swapcache_accrate / 10;
  265                         if (vm_swapcache_curburst > vm_swapcache_maxburst)
  266                                 vm_swapcache_curburst = vm_swapcache_maxburst;
  267                 }
  268 
  269                 /*
   270                  * We don't want to nickel-and-dime the scan as that will
   271                  * create unnecessary fragmentation.  The minimum burst is
   272                  * one second's worth of accumulation (sketch after this function).
  273                  */
  274                 if (state != SWAPC_WRITING) {
  275                         vm_swapcache_cleaning(&swmarker, &swindex);
  276                         continue;
  277                 }
  278                 if (vm_swapcache_curburst < vm_swapcache_accrate)
  279                         continue;
  280 
  281                 reached_end = 0;
  282                 count = vm_swapcache_maxlaunder / PQ_L2_SIZE + 2;
  283                 scount = vm_swapcache_maxscan / PQ_L2_SIZE + 2;
  284 
  285                 if (burst == SWAPB_BURSTING) {
  286                         if (vm_swapcache_writing_heuristic()) {
  287                                 for (q = 0; q < PQ_L2_SIZE; ++q) {
  288                                         reached_end +=
  289                                                 vm_swapcache_writing(
  290                                                         &page_marker[q],
  291                                                         count,
  292                                                         scount);
  293                                 }
  294                         }
  295                         if (vm_swapcache_curburst <= 0)
  296                                 burst = SWAPB_RECOVERING;
  297                 } else if (vm_swapcache_curburst > vm_swapcache_minburst) {
  298                         if (vm_swapcache_writing_heuristic()) {
  299                                 for (q = 0; q < PQ_L2_SIZE; ++q) {
  300                                         reached_end +=
  301                                                 vm_swapcache_writing(
  302                                                         &page_marker[q],
  303                                                         count,
  304                                                         scount);
  305                                 }
  306                         }
  307                         burst = SWAPB_BURSTING;
  308                 }
  309                 if (reached_end == PQ_L2_SIZE) {
  310                         vm_swapcache_inactive_heuristic =
  311                                 -vm_swapcache_hysteresis;
  312                 }
  313         }
  314 
  315         /*
  316          * Cleanup (NOT REACHED)
  317          */
  318         for (q = 0; q < PQ_L2_SIZE; ++q) {
  319                 vm_page_queues_spin_lock(PQ_INACTIVE + q);
  320                 TAILQ_REMOVE(
  321                         &vm_page_queues[PQ_INACTIVE + q].pl,
  322                         &page_marker[q], pageq);
  323                 vm_page_queues_spin_unlock(PQ_INACTIVE + q);
  324         }
  325 
  326         lwkt_gettoken(&vmobj_tokens[swindex]);
  327         TAILQ_REMOVE(&vm_object_lists[swindex], &swmarker, object_list);
  328         lwkt_reltoken(&vmobj_tokens[swindex]);
  329 }
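
/*
 * [Editor's example -- not part of the original file.]  The
 * curburst/accrate handling above is a token bucket: each ~1/10 second
 * tick deposits accrate/10 bytes of write budget up to maxburst, flushing
 * drains it, and the SWAPB_BURSTING/SWAPB_RECOVERING states stop the
 * daemon from dribbling out tiny bursts.  A standalone model with toy
 * numbers (compile separately):
 */
#include <stdio.h>

struct burst {
        long long cur;          /* current budget, bytes */
        long long min;          /* do not resume writing below this */
        long long max;          /* accumulation ceiling */
        long long accrate;      /* refill rate, bytes/sec */
        int recovering;         /* 1 = waiting for budget to rebuild */
};

/* One 1/10-second tick: refill, then report whether writing may proceed. */
static int
burst_tick(struct burst *b)
{
        if (b->cur < b->max) {
                b->cur += b->accrate / 10;
                if (b->cur > b->max)
                        b->cur = b->max;
        }
        if (b->recovering && b->cur > b->min)
                b->recovering = 0;
        return (!b->recovering);
}

/* Charge written bytes against the budget, as vm_swapcached_flush() does. */
static void
burst_charge(struct burst *b, long long bytes)
{
        b->cur -= bytes;
        if (b->cur <= 0)
                b->recovering = 1;
}

int
main(void)
{
        struct burst b = { 0, 30000, 2000000000LL, 100000LL, 1 };
        int tick;

        for (tick = 0; tick < 5; ++tick) {
                if (burst_tick(&b))
                        burst_charge(&b, 65536);        /* one 16-page flush */
                printf("tick %d: budget %lld\n", tick, b.cur);
        }
        return (0);
}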
  330 
  331 static struct kproc_desc swpc_kp = {
  332         "swapcached",
  333         vm_swapcached_thread,
  334         &swapcached_thread
  335 };
   336 SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp);
  337 
  338 /*
  339  * Deal with an overflow of the heuristic counter or if the user
  340  * manually changes the hysteresis.
  341  *
  342  * Try to avoid small incremental pageouts by waiting for enough
   343  * pages to build up in the inactive queue to hopefully get a good
  344  * burst in.  This heuristic is bumped by the VM system and reset
  345  * when our scan hits the end of the queue.
  346  *
  347  * Return TRUE if we need to take a writing pass.
  348  */
  349 static int
  350 vm_swapcache_writing_heuristic(void)
  351 {
  352         int hyst;
  353 
  354         hyst = vmstats.v_inactive_count / 4;
  355         if (hyst < vm_swapcache_min_hysteresis)
  356                 hyst = vm_swapcache_min_hysteresis;
  357         cpu_ccfence();
  358         vm_swapcache_hysteresis = hyst;
  359 
  360         if (vm_swapcache_inactive_heuristic < -hyst)
  361                 vm_swapcache_inactive_heuristic = -hyst;
  362 
  363         return (vm_swapcache_inactive_heuristic >= 0);
  364 }
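
/*
 * [Editor's example -- not part of the original file.]  The heuristic is
 * a counter that is reset to -hysteresis when a scan reaches the end of
 * the inactive queue and is bumped by the VM system as pages are added to
 * it; a writing pass is taken only once it climbs back to zero, i.e. once
 * roughly one hysteresis worth of pages has accumulated.  A compressed
 * model (compile separately):
 */
#include <stdio.h>

int
main(void)
{
        int inactive_count = 40000;
        int min_hysteresis = 1024;              /* default set at thread start */
        int hyst = inactive_count / 4;          /* 10000 */
        int heuristic, bumps;

        if (hyst < min_hysteresis)
                hyst = min_hysteresis;
        heuristic = -hyst;                      /* scan hit end of queue */

        /* The VM system bumps the counter as pages go inactive. */
        for (bumps = 0; heuristic < 0; ++bumps)
                ++heuristic;
        printf("writing pass after %d deactivations\n", bumps);
        return (0);
}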
  365 
  366 /*
  367  * Take a writing pass on one of the inactive queues, return non-zero if
  368  * we hit the end of the queue.
  369  */
  370 static int
  371 vm_swapcache_writing(vm_page_t marker, int count, int scount)
  372 {
  373         vm_object_t object;
  374         struct vnode *vp;
  375         vm_page_t m;
  376         int isblkdev;
  377 
  378         /*
  379          * Scan the inactive queue from our marker to locate
  380          * suitable pages to push to the swap cache.
  381          *
  382          * We are looking for clean vnode-backed pages.
  383          */
  384         vm_page_queues_spin_lock(marker->queue);
  385         while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
  386                count > 0 && scount-- > 0) {
  387                 KKASSERT(m->queue == marker->queue);
  388 
  389                 if (vm_swapcache_curburst < 0)
  390                         break;
  391                 TAILQ_REMOVE(
  392                         &vm_page_queues[marker->queue].pl, marker, pageq);
  393                 TAILQ_INSERT_AFTER(
  394                         &vm_page_queues[marker->queue].pl, m, marker, pageq);
  395 
  396                 /*
  397                  * Ignore markers and ignore pages that already have a swap
  398                  * assignment.
  399                  */
  400                 if (m->flags & (PG_MARKER | PG_SWAPPED))
  401                         continue;
  402                 if (vm_page_busy_try(m, TRUE))
  403                         continue;
  404                 vm_page_queues_spin_unlock(marker->queue);
  405 
  406                 if ((object = m->object) == NULL) {
  407                         vm_page_wakeup(m);
  408                         vm_page_queues_spin_lock(marker->queue);
  409                         continue;
  410                 }
  411                 vm_object_hold(object);
  412                 if (m->object != object) {
  413                         vm_object_drop(object);
  414                         vm_page_wakeup(m);
  415                         vm_page_queues_spin_lock(marker->queue);
  416                         continue;
  417                 }
  418                 if (vm_swapcache_test(m)) {
  419                         vm_object_drop(object);
  420                         vm_page_wakeup(m);
  421                         vm_page_queues_spin_lock(marker->queue);
  422                         continue;
  423                 }
  424 
  425                 vp = object->handle;
  426                 if (vp == NULL) {
  427                         vm_object_drop(object);
  428                         vm_page_wakeup(m);
  429                         vm_page_queues_spin_lock(marker->queue);
  430                         continue;
  431                 }
  432 
  433                 switch(vp->v_type) {
  434                 case VREG:
  435                         /*
  436                          * PG_NOTMETA generically means 'don't swapcache this',
  437                          * and HAMMER will set this for regular data buffers
  438                          * (and leave it unset for meta-data buffers) as
  439                          * appropriate when double buffering is enabled.
  440                          */
  441                         if (m->flags & PG_NOTMETA) {
  442                                 vm_object_drop(object);
  443                                 vm_page_wakeup(m);
  444                                 vm_page_queues_spin_lock(marker->queue);
  445                                 continue;
  446                         }
  447 
  448                         /*
  449                          * If data_enable is 0 do not try to swapcache data.
  450                          * If use_chflags is set then only swapcache data for
  451                          * VSWAPCACHE marked vnodes, otherwise any vnode.
  452                          */
  453                         if (vm_swapcache_data_enable == 0 ||
  454                             ((vp->v_flag & VSWAPCACHE) == 0 &&
  455                              vm_swapcache_use_chflags)) {
  456                                 vm_object_drop(object);
  457                                 vm_page_wakeup(m);
  458                                 vm_page_queues_spin_lock(marker->queue);
  459                                 continue;
  460                         }
  461                         if (vm_swapcache_maxfilesize &&
  462                             object->size >
  463                             (vm_swapcache_maxfilesize >> PAGE_SHIFT)) {
  464                                 vm_object_drop(object);
  465                                 vm_page_wakeup(m);
  466                                 vm_page_queues_spin_lock(marker->queue);
  467                                 continue;
  468                         }
  469                         isblkdev = 0;
  470                         break;
  471                 case VCHR:
  472                         /*
  473                          * PG_NOTMETA generically means 'don't swapcache this',
  474                          * and HAMMER will set this for regular data buffers
  475                          * (and leave it unset for meta-data buffers) as
  476                          * appropriate when double buffering is enabled.
  477                          */
  478                         if (m->flags & PG_NOTMETA) {
  479                                 vm_object_drop(object);
  480                                 vm_page_wakeup(m);
  481                                 vm_page_queues_spin_lock(marker->queue);
  482                                 continue;
  483                         }
  484                         if (vm_swapcache_meta_enable == 0) {
  485                                 vm_object_drop(object);
  486                                 vm_page_wakeup(m);
  487                                 vm_page_queues_spin_lock(marker->queue);
  488                                 continue;
  489                         }
  490                         isblkdev = 1;
  491                         break;
  492                 default:
  493                         vm_object_drop(object);
  494                         vm_page_wakeup(m);
  495                         vm_page_queues_spin_lock(marker->queue);
  496                         continue;
  497                 }
  498 
  499 
  500                 /*
  501                  * Assign swap and initiate I/O.
  502                  *
   503                  * (count is reduced by the number of pages flushed, minimum 1)
  504                  */
  505                 count -= vm_swapcached_flush(m, isblkdev);
  506 
  507                 /*
  508                  * Setup for next loop using marker.
  509                  */
  510                 vm_object_drop(object);
  511                 vm_page_queues_spin_lock(marker->queue);
  512         }
  513 
  514         /*
  515          * The marker could wind up at the end, which is ok.  If we hit the
  516          * end of the list adjust the heuristic.
  517          *
  518          * Earlier inactive pages that were dirty and become clean
  519          * are typically moved to the end of PQ_INACTIVE by virtue
  520          * of vfs_vmio_release() when they become unwired from the
  521          * buffer cache.
  522          */
  523         vm_page_queues_spin_unlock(marker->queue);
  524 
  525         /*
  526          * m invalid but can be used to test for NULL
  527          */
  528         return (m == NULL);
  529 }
  530 
  531 /*
  532  * Flush the specified page using the swap_pager.  The page
  533  * must be busied by the caller and its disposition will become
  534  * the responsibility of this function.
  535  *
  536  * Try to collect surrounding pages, including pages which may
  537  * have already been assigned swap.  Try to cluster within a
   538  * contiguous aligned SWAP_META_PAGES (typ 16 x PAGE_SIZE) block
   539  * to match what swap_pager_putpages() can do (sketch after this function).
  540  *
  541  * We also want to try to match against the buffer cache blocksize
  542  * but we don't really know what it is here.  Since the buffer cache
  543  * wires and unwires pages in groups the fact that we skip wired pages
  544  * should be sufficient.
  545  *
  546  * Returns a count of pages we might have flushed (minimum 1)
  547  */
  548 static
  549 int
  550 vm_swapcached_flush(vm_page_t m, int isblkdev)
  551 {
  552         vm_object_t object;
  553         vm_page_t marray[SWAP_META_PAGES];
  554         vm_pindex_t basei;
  555         int rtvals[SWAP_META_PAGES];
  556         int x;
  557         int i;
  558         int j;
  559         int count;
  560         int error;
  561 
  562         vm_page_io_start(m);
  563         vm_page_protect(m, VM_PROT_READ);
  564         object = m->object;
  565         vm_object_hold(object);
  566 
  567         /*
  568          * Try to cluster around (m), keeping in mind that the swap pager
   569          * can only do SWAP_META_PAGES worth of contiguous write.
  570          */
  571         x = (int)m->pindex & SWAP_META_MASK;
  572         marray[x] = m;
  573         basei = m->pindex;
  574         vm_page_wakeup(m);
  575 
  576         for (i = x - 1; i >= 0; --i) {
  577                 m = vm_page_lookup_busy_try(object, basei - x + i,
  578                                             TRUE, &error);
  579                 if (error || m == NULL)
  580                         break;
  581                 if (vm_swapcache_test(m)) {
  582                         vm_page_wakeup(m);
  583                         break;
  584                 }
  585                 if (isblkdev && (m->flags & PG_NOTMETA)) {
  586                         vm_page_wakeup(m);
  587                         break;
  588                 }
  589                 vm_page_io_start(m);
  590                 vm_page_protect(m, VM_PROT_READ);
  591                 if (m->queue - m->pc == PQ_CACHE) {
  592                         vm_page_unqueue_nowakeup(m);
  593                         vm_page_deactivate(m);
  594                 }
  595                 marray[i] = m;
  596                 vm_page_wakeup(m);
  597         }
  598         ++i;
  599 
  600         for (j = x + 1; j < SWAP_META_PAGES; ++j) {
  601                 m = vm_page_lookup_busy_try(object, basei - x + j,
  602                                             TRUE, &error);
  603                 if (error || m == NULL)
  604                         break;
  605                 if (vm_swapcache_test(m)) {
  606                         vm_page_wakeup(m);
  607                         break;
  608                 }
  609                 if (isblkdev && (m->flags & PG_NOTMETA)) {
  610                         vm_page_wakeup(m);
  611                         break;
  612                 }
  613                 vm_page_io_start(m);
  614                 vm_page_protect(m, VM_PROT_READ);
  615                 if (m->queue - m->pc == PQ_CACHE) {
  616                         vm_page_unqueue_nowakeup(m);
  617                         vm_page_deactivate(m);
  618                 }
  619                 marray[j] = m;
  620                 vm_page_wakeup(m);
  621         }
  622 
  623         count = j - i;
  624         vm_object_pip_add(object, count);
  625         swap_pager_putpages(object, marray + i, count, FALSE, rtvals + i);
  626         vm_swapcache_write_count += count * PAGE_SIZE;
  627         vm_swapcache_curburst -= count * PAGE_SIZE;
  628 
  629         while (i < j) {
  630                 if (rtvals[i] != VM_PAGER_PEND) {
  631                         vm_page_busy_wait(marray[i], FALSE, "swppgfd");
  632                         vm_page_io_finish(marray[i]);
  633                         vm_page_wakeup(marray[i]);
  634                         vm_object_pip_wakeup(object);
  635                 }
  636                 ++i;
  637         }
  638         vm_object_drop(object);
  639         return(count);
  640 }
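
/*
 * [Editor's worked example -- not part of the original file.]  The
 * clustering above stays inside the SWAP_META_PAGES-aligned block that
 * contains the target page, so one flush matches what
 * swap_pager_putpages() can write contiguously.  Assuming the typical
 * SWAP_META_PAGES of 16 (mask 15), the window for a given page index
 * works out as follows (compile separately):
 */
#include <stdio.h>

#define EX_SWAP_META_PAGES      16      /* assumed typical value */
#define EX_SWAP_META_MASK       (EX_SWAP_META_PAGES - 1)

int
main(void)
{
        unsigned long pindex = 37;                      /* page 37 of an object */
        int x = (int)(pindex & EX_SWAP_META_MASK);      /* slot 5 in the block */
        unsigned long base = pindex - x;                /* window base: 32 */

        /* vm_swapcached_flush() scans down to base and up to base + 15. */
        printf("cluster window [%lu, %lu], target slot %d\n",
            base, base + EX_SWAP_META_PAGES - 1, x);
        return (0);
}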
  641 
  642 /*
  643  * Test whether a VM page is suitable for writing to the swapcache.
  644  * Does not test m->queue, PG_MARKER, or PG_SWAPPED.
  645  *
  646  * Returns 0 on success, 1 on failure
  647  */
  648 static int
  649 vm_swapcache_test(vm_page_t m)
  650 {
  651         vm_object_t object;
  652 
  653         if (m->flags & PG_UNMANAGED)
  654                 return(1);
  655         if (m->hold_count || m->wire_count)
  656                 return(1);
  657         if (m->valid != VM_PAGE_BITS_ALL)
  658                 return(1);
  659         if (m->dirty & m->valid)
  660                 return(1);
  661         if ((object = m->object) == NULL)
  662                 return(1);
  663         if (object->type != OBJT_VNODE ||
  664             (object->flags & OBJ_DEAD)) {
  665                 return(1);
  666         }
  667         vm_page_test_dirty(m);
  668         if (m->dirty & m->valid)
  669                 return(1);
  670         return(0);
  671 }
  672 
  673 /*
  674  * Cleaning pass.
  675  *
   676  * We clean whole objects, up to 16MB (vm_swapcache_cleanperobj) each.
  677  */
  678 static
  679 void
  680 vm_swapcache_cleaning(vm_object_t marker, int *swindexp)
  681 {
  682         vm_object_t object;
  683         struct vnode *vp;
  684         int count;
  685         int scount;
  686         int n;
  687 
  688         count = vm_swapcache_maxlaunder;
  689         scount = vm_swapcache_maxscan;
  690 
  691 outerloop:
  692         /*
  693          * Look for vnode objects
  694          */
  695         lwkt_gettoken(&vmobj_tokens[*swindexp]);
  696 
  697         while ((object = TAILQ_NEXT(marker, object_list)) != NULL) {
  698                 /*
  699                  * We have to skip markers.  We cannot hold/drop marker
  700                  * objects!
  701                  */
  702                 if (object->type == OBJT_MARKER) {
  703                         vm_swapcache_movemarker(marker, *swindexp, object);
  704                         continue;
  705                 }
  706 
  707                 /*
  708                  * Safety, or in case there are millions of VM objects
  709                  * without swapcache backing.
  710                  */
  711                 if (--scount <= 0)
  712                         goto breakout;
  713 
  714                 /*
  715                  * We must hold the object before potentially yielding.
  716                  */
  717                 vm_object_hold(object);
  718                 lwkt_yield();
  719 
  720                 /* 
  721                  * Only operate on live VNODE objects that are either
  722                  * VREG or VCHR (VCHR for meta-data).
  723                  */
  724                 if ((object->type != OBJT_VNODE) ||
  725                     ((object->flags & OBJ_DEAD) ||
  726                      object->swblock_count == 0) ||
  727                     ((vp = object->handle) == NULL) ||
  728                     (vp->v_type != VREG && vp->v_type != VCHR)) {
  729                         vm_object_drop(object);
  730                         /* object may be invalid now */
  731                         vm_swapcache_movemarker(marker, *swindexp, object);
  732                         continue;
  733                 }
  734 
  735                 /*
  736                  * Reset the object pindex stored in the marker if the
  737                  * working object has changed.
  738                  */
  739                 if (marker->backing_object != object) {
  740                         marker->size = 0;
  741                         marker->backing_object_offset = 0;
  742                         marker->backing_object = object;
  743                 }
  744 
  745                 /*
  746                  * Look for swblocks starting at our iterator.
  747                  *
  748                  * The swap_pager_condfree() function attempts to free
  749                  * swap space starting at the specified index.  The index
  750                  * will be updated on return.  The function will return
  751                  * a scan factor (NOT the number of blocks freed).
  752                  *
  753                  * If it must cut its scan of the object short due to an
  754                  * excessive number of swblocks, or is able to free the
  755                  * requested number of blocks, it will return n >= count
  756                  * and we break and pick it back up on a future attempt.
  757                  *
  758                  * Scan the object linearly and try to batch large sets of
  759                  * blocks that are likely to clean out entire swap radix
   760                  * tree leaves; the round-up is sketched after this function.
  761                  */
  762                 lwkt_token_swap();
  763                 lwkt_reltoken(&vmobj_tokens[*swindexp]);
  764 
  765                 n = swap_pager_condfree(object, &marker->size,
  766                                     (count + SWAP_META_MASK) & ~SWAP_META_MASK);
  767 
  768                 vm_object_drop(object);         /* object may be invalid now */
  769                 lwkt_gettoken(&vmobj_tokens[*swindexp]);
  770 
  771                 /*
  772                  * If we have exhausted the object or deleted our per-pass
  773                  * page limit then move us to the next object.  Note that
  774                  * the current object may no longer be on the vm_object_list.
  775                  */
  776                 if (n <= 0 ||
  777                     marker->backing_object_offset > vm_swapcache_cleanperobj) {
  778                         vm_swapcache_movemarker(marker, *swindexp, object);
  779                 }
  780 
  781                 /*
  782                  * If we have exhausted our max-launder stop for now.
  783                  */
  784                 count -= n;
  785                 marker->backing_object_offset += n * PAGE_SIZE;
  786                 if (count < 0)
  787                         goto breakout;
  788         }
  789 
  790         /*
  791          * Iterate vm_object_lists[] hash table
  792          */
  793         TAILQ_REMOVE(&vm_object_lists[*swindexp], marker, object_list);
  794         lwkt_reltoken(&vmobj_tokens[*swindexp]);
  795         if (++*swindexp >= VMOBJ_HSIZE)
  796                 *swindexp = 0;
  797         lwkt_gettoken(&vmobj_tokens[*swindexp]);
  798         TAILQ_INSERT_HEAD(&vm_object_lists[*swindexp], marker, object_list);
  799 
  800         if (*swindexp != 0)
  801                 goto outerloop;
  802 
  803 breakout:
  804         lwkt_reltoken(&vmobj_tokens[*swindexp]);
  805 }
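
/*
 * [Editor's worked example -- not part of the original file.]  The
 * (count + SWAP_META_MASK) & ~SWAP_META_MASK expression above rounds the
 * free request up to a multiple of SWAP_META_PAGES so whole swap radix
 * tree leaves can be released at once.  Assuming a mask of 15
 * (SWAP_META_PAGES of 16), compile separately:
 */
#include <stdio.h>

int
main(void)
{
        int mask = 15;          /* SWAP_META_PAGES - 1 */
        int count;

        for (count = 1; count <= 33; count += 16)
                printf("%2d -> %2d\n", count, (count + mask) & ~mask);
        /* prints: 1 -> 16, 17 -> 32, 33 -> 48 */
        return (0);
}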
  806 
  807 /*
  808  * Move the marker past the current object.  Object can be stale, but we
  809  * still need it to determine if the marker has to be moved.  If the object
  810  * is still the 'current object' (object after the marker), we hop-scotch
  811  * the marker past it.
  812  */
  813 static void
  814 vm_swapcache_movemarker(vm_object_t marker, int swindex, vm_object_t object)
  815 {
  816         if (TAILQ_NEXT(marker, object_list) == object) {
  817                 TAILQ_REMOVE(&vm_object_lists[swindex], marker, object_list);
  818                 TAILQ_INSERT_AFTER(&vm_object_lists[swindex], object,
  819                                    marker, object_list);
  820         }
  821 }
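
/*
 * [Editor's example -- not part of the original file.]  The cleaning scan
 * keeps its place in vm_object_lists[] with a marker object so the list
 * token can be dropped while an object is worked on, and
 * vm_swapcache_movemarker() hop-scotches the marker past each finished
 * object.  A generic userland sketch of the same pattern over a
 * <sys/queue.h> TAILQ; all names here are hypothetical (compile
 * separately):
 */
#include <sys/queue.h>
#include <stdio.h>

struct node {
        TAILQ_ENTRY(node) link;
        int is_marker;
        int value;
};
TAILQ_HEAD(nodelist, node);

/* Hop-scotch the marker past obj if obj still follows the marker. */
static void
movemarker(struct nodelist *list, struct node *marker, struct node *obj)
{
        if (TAILQ_NEXT(marker, link) == obj) {
                TAILQ_REMOVE(list, marker, link);
                TAILQ_INSERT_AFTER(list, obj, marker, link);
        }
}

static void
scan(struct nodelist *list, struct node *marker)
{
        struct node *n;

        TAILQ_INSERT_HEAD(list, marker, link);
        while ((n = TAILQ_NEXT(marker, link)) != NULL) {
                if (n->is_marker) {     /* never hold/drop other markers */
                        movemarker(list, marker, n);
                        continue;
                }
                /* ...a lock could be dropped here while working on n... */
                printf("visit %d\n", n->value);
                movemarker(list, marker, n);
        }
        TAILQ_REMOVE(list, marker, link);
}

int
main(void)
{
        struct nodelist list = TAILQ_HEAD_INITIALIZER(list);
        struct node nodes[3] = {
                { .value = 1 }, { .value = 2 }, { .value = 3 }
        };
        struct node marker = { .is_marker = 1 };
        int i;

        for (i = 0; i < 3; ++i)
                TAILQ_INSERT_TAIL(&list, &nodes[i], link);
        scan(&list, &marker);
        return (0);
}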
