FreeBSD/Linux Kernel Cross Reference
sys/vm/swap_pager.c

    1 /*-
    2  * Copyright (c) 1998 Matthew Dillon,
    3  * Copyright (c) 1994 John S. Dyson
    4  * Copyright (c) 1990 University of Utah.
    5  * Copyright (c) 1982, 1986, 1989, 1993
    6  *      The Regents of the University of California.  All rights reserved.
    7  *
    8  * This code is derived from software contributed to Berkeley by
    9  * the Systems Programming Group of the University of Utah Computer
   10  * Science Department.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. All advertising materials mentioning features or use of this software
   21  *    must display the following acknowledgement:
   22  *      This product includes software developed by the University of
   23  *      California, Berkeley and its contributors.
   24  * 4. Neither the name of the University nor the names of its contributors
   25  *    may be used to endorse or promote products derived from this software
   26  *    without specific prior written permission.
   27  *
   28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   38  * SUCH DAMAGE.
   39  *
   40  *                              New Swap System
   41  *                              Matthew Dillon
   42  *
   43  * Radix Bitmap 'blists'.
   44  *
   45  *      - The new swapper uses the new radix bitmap code.  This should scale
   46  *        to arbitrarily small or arbitrarily large swap spaces and an almost
   47  *        arbitrary degree of fragmentation.
   48  *
   49  * Features:
   50  *
   51  *      - on the fly reallocation of swap during putpages.  The new system
   52  *        does not try to keep previously allocated swap blocks for dirty
   53  *        pages.  
   54  *
   55  *      - on the fly deallocation of swap
   56  *
   57  *      - No more garbage collection required.  Unnecessarily allocated swap
   58  *        blocks only exist for dirty vm_page_t's now and these are already
   59  *        cycled (in a high-load system) by the pager.  We also do on-the-fly
   60  *        removal of invalidated swap blocks when a page is destroyed
   61  *        or renamed.
   62  *
   63  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
   64  *
   65  *      @(#)swap_pager.c        8.9 (Berkeley) 3/21/94
   66  *      @(#)vm_swap.c   8.5 (Berkeley) 2/17/94
   67  */
   68 
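       /*
        * Added note (illustrative, based on the blist usage later in this
        * file): each swap device carries its own radix-bitmap allocator,
        * and raw swap space is handed out and returned with calls of the
        * form
        *
        *      blk = blist_alloc(sp->sw_blist, npages);
        *      blist_free(sp->sw_blist, blk - sp->sw_first, npages);
        *
        * which is what lets allocation and freeing happen on the fly
        * without a separate garbage-collection pass.
        */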
   69 #include <sys/cdefs.h>
   70 __FBSDID("$FreeBSD$");
   71 
   72 #include "opt_mac.h"
   73 #include "opt_swap.h"
   74 #include "opt_vm.h"
   75 
   76 #include <sys/param.h>
   77 #include <sys/systm.h>
   78 #include <sys/conf.h>
   79 #include <sys/kernel.h>
   80 #include <sys/proc.h>
   81 #include <sys/bio.h>
   82 #include <sys/buf.h>
   83 #include <sys/disk.h>
   84 #include <sys/fcntl.h>
   85 #include <sys/mount.h>
   86 #include <sys/namei.h>
   87 #include <sys/vnode.h>
   88 #include <sys/mac.h>
   89 #include <sys/malloc.h>
   90 #include <sys/sysctl.h>
   91 #include <sys/sysproto.h>
   92 #include <sys/blist.h>
   93 #include <sys/lock.h>
   94 #include <sys/sx.h>
   95 #include <sys/vmmeter.h>
   96 
   97 #include <vm/vm.h>
   98 #include <vm/pmap.h>
   99 #include <vm/vm_map.h>
  100 #include <vm/vm_kern.h>
  101 #include <vm/vm_object.h>
  102 #include <vm/vm_page.h>
  103 #include <vm/vm_pager.h>
  104 #include <vm/vm_pageout.h>
  105 #include <vm/vm_param.h>
  106 #include <vm/swap_pager.h>
  107 #include <vm/vm_extern.h>
  108 #include <vm/uma.h>
  109 
  110 #include <geom/geom.h>
  111 
  112 /*
  113  * SWB_NPAGES must be a power of 2.  It may be set to 1, 2, 4, 8, or 16
  114  * pages per allocation.  We recommend you stick with the default of 8.
  115  * The 16-page limit is due to the radix code (kern/subr_blist.c).
  116  */
  117 #ifndef MAX_PAGEOUT_CLUSTER
  118 #define MAX_PAGEOUT_CLUSTER 16
  119 #endif
  120 
  121 #if !defined(SWB_NPAGES)
  122 #define SWB_NPAGES      MAX_PAGEOUT_CLUSTER
  123 #endif
  124 
  125 /*
  126  * Piecemeal swap metadata structure.  Swap is stored in a radix tree.
  127  *
  128  * If SWB_NPAGES is 8 and sizeof(char *) == sizeof(daddr_t), our radix
  129  * is basically 8.  Assuming PAGE_SIZE == 4096, one tree level represents
  130  * 32K worth of data, two levels represent 256K, three levels represent
  131  * 2 MBytes.   This is acceptable.
  132  *
  133  * Overall memory utilization is about the same as the old swap structure.
  134  */
  135 #define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t))
  136 #define SWAP_META_PAGES         (SWB_NPAGES * 2)
  137 #define SWAP_META_MASK          (SWAP_META_PAGES - 1)
  138 
  139 typedef int32_t swblk_t;        /*
  140                                  * swap offset.  This is the type used to
  141                                  * address the "virtual swap device" and
  142                                  * therefore the maximum swap space is
  143                                  * 2^32 pages.
  144                                  */
  145 
  146 struct swdevt;
  147 typedef void sw_strategy_t(struct buf *bp, struct swdevt *sw);
  148 typedef void sw_close_t(struct thread *td, struct swdevt *sw);
  149 
  150 /*
  151  * Swap device table
  152  */
  153 struct swdevt {
  154         int     sw_flags;
  155         int     sw_nblks;
  156         int     sw_used;
  157         dev_t   sw_dev;
  158         struct vnode *sw_vp;
  159         void    *sw_id;
  160         swblk_t sw_first;
  161         swblk_t sw_end;
  162         struct blist *sw_blist;
  163         TAILQ_ENTRY(swdevt)     sw_list;
  164         sw_strategy_t           *sw_strategy;
  165         sw_close_t              *sw_close;
  166 };
  167 
  168 #define SW_CLOSING      0x04
  169 
  170 struct swblock {
  171         struct swblock  *swb_hnext;
  172         vm_object_t     swb_object;
  173         vm_pindex_t     swb_index;
  174         int             swb_count;
  175         daddr_t         swb_pages[SWAP_META_PAGES];
  176 };
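       /*
        * Illustrative sketch (added commentary, not part of the original
        * source): a page index is split in two when locating its swap
        * metadata,
        *
        *      base = pindex & ~SWAP_META_MASK;    selects the swblock
        *      slot = pindex & SWAP_META_MASK;     selects swb_pages[slot]
        *
        * e.g. if SWB_NPAGES were 8 (the case discussed in the comment
        * above), SWAP_META_PAGES is 16 and pindex 37 lands in the swblock
        * whose swb_index is 32, at swb_pages[5].
        */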
  177 
  178 static struct mtx sw_dev_mtx;
  179 static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq);
  180 static struct swdevt *swdevhd;  /* Allocate from here next */
  181 static int nswapdev;            /* Number of swap devices */
  182 int swap_pager_avail;
  183 static int swdev_syscall_active = 0; /* serialize swap(on|off) */
  184 
  185 static void swapdev_strategy(struct buf *, struct swdevt *sw);
  186 
  187 #define SWM_FREE        0x02    /* free, period                 */
  188 #define SWM_POP         0x04    /* pop out                      */
  189 
  190 int swap_pager_full = 2;        /* swap space exhaustion (task killing) */
  191 static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
  192 static int nsw_rcount;          /* free read buffers                    */
  193 static int nsw_wcount_sync;     /* limit write buffers / synchronous    */
  194 static int nsw_wcount_async;    /* limit write buffers / asynchronous   */
  195 static int nsw_wcount_async_max;/* assigned maximum                     */
  196 static int nsw_cluster_max;     /* maximum VOP I/O allowed              */
  197 
  198 static struct swblock **swhash;
  199 static int swhash_mask;
  200 static struct mtx swhash_mtx;
  201 
  202 static int swap_async_max = 4;  /* maximum in-progress async I/O's      */
  203 static struct sx sw_alloc_sx;
  204 
  205 
  206 SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
  207         CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
  208 
  209 /*
  210  * "named" and "unnamed" anon region objects.  Try to reduce the overhead
  211  * of searching a named list by hashing it just a little.
  212  */
  213 
  214 #define NOBJLISTS               8
  215 
  216 #define NOBJLIST(handle)        \
  217         (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
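       /*
        * Worked example (illustrative only): with NOBJLISTS == 8, bits 4-6
        * of the handle pick the list, so a handle of 0x1230 hashes to
        * ((0x1230 >> 4) & 7) == 3, i.e. &swap_pager_object_list[3].
        */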
  218 
  219 static struct mtx sw_alloc_mtx; /* protect list manipulation */ 
  220 static struct pagerlst  swap_pager_object_list[NOBJLISTS];
  221 static uma_zone_t       swap_zone;
  222 static struct vm_object swap_zone_obj;
  223 
  224 /*
  225  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
  226  * calls hooked from other parts of the VM system and do not appear here.
  227  * (see vm/swap_pager.h).
  228  */
  229 static vm_object_t
  230                 swap_pager_alloc(void *handle, vm_ooffset_t size,
  231                                       vm_prot_t prot, vm_ooffset_t offset);
  232 static void     swap_pager_dealloc(vm_object_t object);
  233 static int      swap_pager_getpages(vm_object_t, vm_page_t *, int, int);
  234 static void     swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
  235 static boolean_t
  236                 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
  237 static void     swap_pager_init(void);
  238 static void     swap_pager_unswapped(vm_page_t);
  239 static void     swap_pager_swapoff(struct swdevt *sp);
  240 
  241 struct pagerops swappagerops = {
  242         .pgo_init =     swap_pager_init,        /* early system initialization of pager */
  243         .pgo_alloc =    swap_pager_alloc,       /* allocate an OBJT_SWAP object         */
  244         .pgo_dealloc =  swap_pager_dealloc,     /* deallocate an OBJT_SWAP object       */
  245         .pgo_getpages = swap_pager_getpages,    /* pagein                               */
  246         .pgo_putpages = swap_pager_putpages,    /* pageout                              */
  247         .pgo_haspage =  swap_pager_haspage,     /* get backing store status for page    */
  248         .pgo_pageunswapped = swap_pager_unswapped,      /* remove swap related to page          */
  249 };
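       /*
        * Dispatch sketch (added note; assumes the generic pager layer of
        * this era): the VM system reaches these routines through a table of
        * pagerops indexed by object type, so a call such as
        *
        *      vm_pager_get_pages(object, m, count, reqpage);
        *
        * ends up in swap_pager_getpages() when object->type == OBJT_SWAP.
        */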
  250 
  251 /*
  252  * dmmax is in page-sized chunks with the new swap system.  It was
  253  * dev-bsized chunks in the old.  dmmax is always a power of 2.
  254  *
  255  * swap_*() routines are externally accessible.  swp_*() routines are
  256  * internal.
  257  */
  258 static int dmmax;
  259 static int nswap_lowat = 128;   /* in pages, swap_pager_almost_full warn */
  260 static int nswap_hiwat = 512;   /* in pages, swap_pager_almost_full warn */
  261 
  262 SYSCTL_INT(_vm, OID_AUTO, dmmax,
  263         CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block");
  264 
  265 static void     swp_sizecheck(void);
  266 static void     swp_pager_async_iodone(struct buf *bp);
  267 static int      swapongeom(struct thread *, struct vnode *);
  268 static int      swaponvp(struct thread *, struct vnode *, u_long);
  269 
  270 /*
  271  * Swap bitmap functions
  272  */
  273 static void     swp_pager_freeswapspace(daddr_t blk, int npages);
  274 static daddr_t  swp_pager_getswapspace(int npages);
  275 
  276 /*
  277  * Metadata functions
  278  */
  279 static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index);
  280 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
  281 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
  282 static void swp_pager_meta_free_all(vm_object_t);
  283 static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
  284 
  285 /*
  286  * SWP_SIZECHECK() -    update swap_pager_full indication
  287  *      
  288  *      update the swap_pager_almost_full indication and warn when we are
  289  *      about to run out of swap space, using lowat/hiwat hysteresis.
  290  *
  291  *      Clear swap_pager_full ( task killing ) indication when lowat is met.
  292  *
  293  *      No restrictions on call
  294  *      This routine may not block.
  295  *      This routine must be called at splvm()
  296  */
  297 static void
  298 swp_sizecheck(void)
  299 {
  300 
  301         if (swap_pager_avail < nswap_lowat) {
  302                 if (swap_pager_almost_full == 0) {
  303                         printf("swap_pager: out of swap space\n");
  304                         swap_pager_almost_full = 1;
  305                 }
  306         } else {
  307                 swap_pager_full = 0;
  308                 if (swap_pager_avail > nswap_hiwat)
  309                         swap_pager_almost_full = 0;
  310         }
  311 }
  312 
  313 /*
  314  * SWP_PAGER_HASH() -   hash swap meta data
  315  *
   316  *      This is a helper function which hashes the swapblk given
   317  *      the object and page index.  It returns a pointer to a pointer
   318  *      to the swblock structure, or a pointer to a NULL pointer if it
   319  *      could not find a swblock.
  320  *
  321  *      This routine must be called at splvm().
  322  */
  323 static struct swblock **
  324 swp_pager_hash(vm_object_t object, vm_pindex_t index)
  325 {
  326         struct swblock **pswap;
  327         struct swblock *swap;
  328 
  329         index &= ~(vm_pindex_t)SWAP_META_MASK;
  330         pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask];
  331         while ((swap = *pswap) != NULL) {
  332                 if (swap->swb_object == object &&
  333                     swap->swb_index == index
  334                 ) {
  335                         break;
  336                 }
  337                 pswap = &swap->swb_hnext;
  338         }
  339         return (pswap);
  340 }
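       /*
        * Usage sketch (added commentary; modeled on how the
        * swp_pager_meta_*() routines use this function): the returned
        * pointer-to-pointer serves both lookup and insertion, roughly:
        *
        *      pswap = swp_pager_hash(object, index);
        *      if ((swap = *pswap) == NULL) {
        *              swap = uma_zalloc(swap_zone, M_NOWAIT);  new entry
        *              ... fill in swb_object, swb_index, swb_pages[] ...
        *              swap->swb_hnext = NULL;
        *              *pswap = swap;                           link into chain
        *      }
        */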
  341 
  342 /*
  343  * SWAP_PAGER_INIT() -  initialize the swap pager!
  344  *
  345  *      Expected to be started from system init.  NOTE:  This code is run 
  346  *      before much else so be careful what you depend on.  Most of the VM
  347  *      system has yet to be initialized at this point.
  348  */
  349 static void
  350 swap_pager_init(void)
  351 {
  352         /*
  353          * Initialize object lists
  354          */
  355         int i;
  356 
  357         for (i = 0; i < NOBJLISTS; ++i)
  358                 TAILQ_INIT(&swap_pager_object_list[i]);
  359         mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF);
  360         mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
  361 
  362         /*
  363          * Device Stripe, in PAGE_SIZE'd blocks
  364          */
  365         dmmax = SWB_NPAGES * 2;
  366 }
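       /*
        * Sizing note (added, assuming the defaults defined above): with
        * SWB_NPAGES == 16 the device stripe dmmax is 32 pages, i.e. 128KB
        * with 4KB pages; with SWB_NPAGES == 8 it would be 16 pages (64KB).
        */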
  367 
  368 /*
  369  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
  370  *
  371  *      Expected to be started from pageout process once, prior to entering
  372  *      its main loop.
  373  */
  374 void
  375 swap_pager_swap_init(void)
  376 {
  377         int n, n2;
  378 
  379         /*
  380          * Number of in-transit swap bp operations.  Don't
  381          * exhaust the pbufs completely.  Make sure we
  382          * initialize workable values (0 will work for hysteresis
  383          * but it isn't very efficient).
  384          *
  385          * The nsw_cluster_max is constrained by the bp->b_pages[]
  386          * array (MAXPHYS/PAGE_SIZE) and our locally defined
  387          * MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
  388          * constrained by the swap device interleave stripe size.
  389          *
  390          * Currently we hardwire nsw_wcount_async to 4.  This limit is 
  391          * designed to prevent other I/O from having high latencies due to
  392          * our pageout I/O.  The value 4 works well for one or two active swap
  393          * devices but is probably a little low if you have more.  Even so,
  394          * a higher value would probably generate only a limited improvement
  395          * with three or four active swap devices since the system does not
  396          * typically have to pageout at extreme bandwidths.   We will want
   397  *      at least 2 per swap device, and 4 is a pretty good value if you
  398          * have one NFS swap device due to the command/ack latency over NFS.
  399          * So it all works out pretty well.
  400          */
  401         nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
  402 
  403         mtx_lock(&pbuf_mtx);
  404         nsw_rcount = (nswbuf + 1) / 2;
  405         nsw_wcount_sync = (nswbuf + 3) / 4;
  406         nsw_wcount_async = 4;
  407         nsw_wcount_async_max = nsw_wcount_async;
  408         mtx_unlock(&pbuf_mtx);
  409 
  410         /*
  411          * Initialize our zone.  Right now I'm just guessing on the number
  412          * we need based on the number of pages in the system.  Each swblock
  413          * can hold 16 pages, so this is probably overkill.  This reservation
  414          * is typically limited to around 32MB by default.
  415          */
  416         n = cnt.v_page_count / 2;
  417         if (maxswzone && n > maxswzone / sizeof(struct swblock))
  418                 n = maxswzone / sizeof(struct swblock);
  419         n2 = n;
  420         swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL,
  421             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
  422         if (swap_zone == NULL)
  423                 panic("failed to create swap_zone.");
  424         do {
  425                 if (uma_zone_set_obj(swap_zone, &swap_zone_obj, n))
  426                         break;
  427                 /*
  428                  * if the allocation failed, try a zone two thirds the
  429                  * size of the previous attempt.
  430                  */
  431                 n -= ((n + 2) / 3);
  432         } while (n > 0);
  433         if (n2 != n)
  434                 printf("Swap zone entries reduced from %d to %d.\n", n2, n);
  435         n2 = n;
  436 
  437         /*
  438          * Initialize our meta-data hash table.  The swapper does not need to
  439          * be quite as efficient as the VM system, so we do not use an 
  440          * oversized hash table.
  441          *
  442          *      n:              size of hash table, must be power of 2
  443          *      swhash_mask:    hash table index mask
  444          */
  445         for (n = 1; n < n2 / 8; n *= 2)
  446                 ;
  447         swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO);
  448         swhash_mask = n - 1;
  449         mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF);
  450 }
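       /*
        * Worked example (illustrative): if the zone ends up holding
        * n2 == 8192 swblocks, the sizing loop above selects the first power
        * of 2 >= n2/8, so the hash table gets n == 1024 buckets and
        * swhash_mask == 1023.
        */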
  451 
  452 /*
  453  * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
  454  *                      its metadata structures.
  455  *
  456  *      This routine is called from the mmap and fork code to create a new
  457  *      OBJT_SWAP object.  We do this by creating an OBJT_DEFAULT object
  458  *      and then converting it with swp_pager_meta_build().
  459  *
  460  *      This routine may block in vm_object_allocate() and create a named
  461  *      object lookup race, so we must interlock.   We must also run at
  462  *      splvm() for the object lookup to handle races with interrupts, but
  463  *      we do not have to maintain splvm() in between the lookup and the
  464  *      add because (I believe) it is not possible to attempt to create
  465  *      a new swap object w/handle when a default object with that handle
  466  *      already exists.
  467  *
  468  * MPSAFE
  469  */
  470 static vm_object_t
  471 swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
  472                  vm_ooffset_t offset)
  473 {
  474         vm_object_t object;
  475         vm_pindex_t pindex;
  476 
  477         pindex = OFF_TO_IDX(offset + PAGE_MASK + size);
  478 
  479         if (handle) {
  480                 mtx_lock(&Giant);
  481                 /*
  482                  * Reference existing named region or allocate new one.  There
  483                  * should not be a race here against swp_pager_meta_build()
  484                  * as called from vm_page_remove() in regards to the lookup
  485                  * of the handle.
  486                  */
  487                 sx_xlock(&sw_alloc_sx);
  488                 object = vm_pager_object_lookup(NOBJLIST(handle), handle);
  489 
  490                 if (object != NULL) {
  491                         vm_object_reference(object);
  492                 } else {
  493                         object = vm_object_allocate(OBJT_DEFAULT, pindex);
  494                         object->handle = handle;
  495 
  496                         VM_OBJECT_LOCK(object);
  497                         swp_pager_meta_build(object, 0, SWAPBLK_NONE);
  498                         VM_OBJECT_UNLOCK(object);
  499                 }
  500                 sx_xunlock(&sw_alloc_sx);
  501                 mtx_unlock(&Giant);
  502         } else {
  503                 object = vm_object_allocate(OBJT_DEFAULT, pindex);
  504 
  505                 VM_OBJECT_LOCK(object);
  506                 swp_pager_meta_build(object, 0, SWAPBLK_NONE);
  507                 VM_OBJECT_UNLOCK(object);
  508         }
  509         return (object);
  510 }
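       /*
        * Caller sketch (added note; the call path is assumed, not shown in
        * this file): mmap and fork reach this routine through the generic
        * allocator, e.g.
        *
        *      object = vm_pager_allocate(OBJT_SWAP, handle, size, prot, offset);
        *
        * which dispatches here via swappagerops.pgo_alloc.
        */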
  511 
  512 /*
  513  * SWAP_PAGER_DEALLOC() -       remove swap metadata from object
  514  *
  515  *      The swap backing for the object is destroyed.  The code is 
  516  *      designed such that we can reinstantiate it later, but this
  517  *      routine is typically called only when the entire object is
  518  *      about to be destroyed.
  519  *
   520  *      This routine is allowed to block, but it no longer does.
  521  *
  522  *      The object must be locked or unreferenceable.
  523  */
  524 static void
  525 swap_pager_dealloc(vm_object_t object)
  526 {
  527         int s;
  528 
  529         /*
  530          * Remove from list right away so lookups will fail if we block for
  531          * pageout completion.
  532          */
  533         if (object->handle != NULL) {
  534                 mtx_lock(&sw_alloc_mtx);
  535                 TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list);
  536                 mtx_unlock(&sw_alloc_mtx);
  537         }
  538 
  539         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
  540         vm_object_pip_wait(object, "swpdea");
  541 
  542         /*
  543          * Free all remaining metadata.  We only bother to free it from 
  544          * the swap meta data.  We do not attempt to free swapblk's still
  545          * associated with vm_page_t's for this object.  We do not care
  546          * if paging is still in progress on some objects.
  547          */
  548         s = splvm();
  549         swp_pager_meta_free_all(object);
  550         splx(s);
  551 }
  552 
  553 /************************************************************************
  554  *                      SWAP PAGER BITMAP ROUTINES                      *
  555  ************************************************************************/
  556 
  557 /*
  558  * SWP_PAGER_GETSWAPSPACE() -   allocate raw swap space
  559  *
  560  *      Allocate swap for the requested number of pages.  The starting
  561  *      swap block number (a page index) is returned or SWAPBLK_NONE
  562  *      if the allocation failed.
  563  *
  564  *      Also has the side effect of advising that somebody made a mistake
  565  *      when they configured swap and didn't configure enough.
  566  *
  567  *      Must be called at splvm() to avoid races with bitmap frees from
  568  *      vm_page_remove() aka swap_pager_page_removed().
  569  *
  570  *      This routine may not block
  571  *      This routine must be called at splvm().
  572  *
  573  *      We allocate in round-robin fashion from the configured devices.
  574  */
  575 static daddr_t
  576 swp_pager_getswapspace(int npages)
  577 {
  578         daddr_t blk;
  579         struct swdevt *sp;
  580         int i;
  581 
  582         blk = SWAPBLK_NONE;
  583         mtx_lock(&sw_dev_mtx);
  584         sp = swdevhd;
  585         for (i = 0; i < nswapdev; i++) {
  586                 if (sp == NULL)
  587                         sp = TAILQ_FIRST(&swtailq);
  588                 if (!(sp->sw_flags & SW_CLOSING)) {
  589                         blk = blist_alloc(sp->sw_blist, npages);
  590                         if (blk != SWAPBLK_NONE) {
  591                                 blk += sp->sw_first;
  592                                 sp->sw_used += npages;
  593                                 swap_pager_avail -= npages;
  594                                 swp_sizecheck();
  595                                 swdevhd = TAILQ_NEXT(sp, sw_list);
  596                                 goto done;
  597                         }
  598                 }
  599                 sp = TAILQ_NEXT(sp, sw_list);
  600         }
  601         if (swap_pager_full != 2) {
  602                 printf("swap_pager_getswapspace(%d): failed\n", npages);
  603                 swap_pager_full = 2;
  604                 swap_pager_almost_full = 1;
  605         }
  606         swdevhd = NULL;
  607 done:
  608         mtx_unlock(&sw_dev_mtx);
  609         return (blk);
  610 }
  611 
  612 static int
  613 swp_pager_isondev(daddr_t blk, struct swdevt *sp)
  614 {
  615 
  616         return (blk >= sp->sw_first && blk < sp->sw_end);
  617 }
  618         
  619 static void
  620 swp_pager_strategy(struct buf *bp)
  621 {
  622         struct swdevt *sp;
  623 
  624         mtx_lock(&sw_dev_mtx);
  625         TAILQ_FOREACH(sp, &swtailq, sw_list) {
  626                 if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) {
  627                         mtx_unlock(&sw_dev_mtx);
  628                         sp->sw_strategy(bp, sp);
  629                         return;
  630                 }
  631         }
  632         panic("Swapdev not found");
  633 }
  634         
  635 
  636 /*
  637  * SWP_PAGER_FREESWAPSPACE() -  free raw swap space 
  638  *
  639  *      This routine returns the specified swap blocks back to the bitmap.
  640  *
  641  *      Note:  This routine may not block (it could in the old swap code),
  642  *      and through the use of the new blist routines it does not block.
  643  *
  644  *      We must be called at splvm() to avoid races with bitmap frees from
  645  *      vm_page_remove() aka swap_pager_page_removed().
  646  *
  647  *      This routine may not block
  648  *      This routine must be called at splvm().
  649  */
  650 static void
  651 swp_pager_freeswapspace(daddr_t blk, int npages)
  652 {
  653         struct swdevt *sp;
  654 
  655         mtx_lock(&sw_dev_mtx);
  656         TAILQ_FOREACH(sp, &swtailq, sw_list) {
  657                 if (blk >= sp->sw_first && blk < sp->sw_end) {
  658                         sp->sw_used -= npages;
  659                         /*
  660                          * If we are attempting to stop swapping on
  661                          * this device, we don't want to mark any
  662                          * blocks free lest they be reused.  
  663                          */
  664                         if ((sp->sw_flags & SW_CLOSING) == 0) {
  665                                 blist_free(sp->sw_blist, blk - sp->sw_first,
  666                                     npages);
  667                                 swap_pager_avail += npages;
  668                                 swp_sizecheck();
  669                         }
  670                         mtx_unlock(&sw_dev_mtx);
  671                         return;
  672                 }
  673         }
  674         panic("Swapdev not found");
  675 }
  676 
  677 /*
  678  * SWAP_PAGER_FREESPACE() -     frees swap blocks associated with a page
  679  *                              range within an object.
  680  *
  681  *      This is a globally accessible routine.
  682  *
  683  *      This routine removes swapblk assignments from swap metadata.
  684  *
  685  *      The external callers of this routine typically have already destroyed 
  686  *      or renamed vm_page_t's associated with this range in the object so 
  687  *      we should be ok.
  688  *
  689  *      This routine may be called at any spl.  We up our spl to splvm temporarily
  690  *      in order to perform the metadata removal.
  691  */
  692 void
  693 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size)
  694 {
  695         int s = splvm();
  696 
  697         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
  698         swp_pager_meta_free(object, start, size);
  699         splx(s);
  700 }
  701 
  702 /*
  703  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
  704  *
  705  *      Assigns swap blocks to the specified range within the object.  The 
   706  *      swap blocks are not zeroed.  Any previous swap assignment is destroyed.
  707  *
  708  *      Returns 0 on success, -1 on failure.
  709  */
  710 int
  711 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
  712 {
  713         int s;
  714         int n = 0;
  715         daddr_t blk = SWAPBLK_NONE;
  716         vm_pindex_t beg = start;        /* save start index */
  717 
  718         s = splvm();
  719         VM_OBJECT_LOCK(object);
  720         while (size) {
  721                 if (n == 0) {
  722                         n = BLIST_MAX_ALLOC;
  723                         while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) {
  724                                 n >>= 1;
  725                                 if (n == 0) {
  726                                         swp_pager_meta_free(object, beg, start - beg);
  727                                         VM_OBJECT_UNLOCK(object);
  728                                         splx(s);
  729                                         return (-1);
  730                                 }
  731                         }
  732                 }
  733                 swp_pager_meta_build(object, start, blk);
  734                 --size;
  735                 ++start;
  736                 ++blk;
  737                 --n;
  738         }
  739         swp_pager_meta_free(object, start, n);
  740         VM_OBJECT_UNLOCK(object);
  741         splx(s);
  742         return (0);
  743 }
  744 
  745 /*
  746  * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
  747  *                      and destroy the source.
  748  *
  749  *      Copy any valid swapblks from the source to the destination.  In
  750  *      cases where both the source and destination have a valid swapblk,
  751  *      we keep the destination's.
  752  *
  753  *      This routine is allowed to block.  It may block allocating metadata
  754  *      indirectly through swp_pager_meta_build() or if paging is still in
  755  *      progress on the source. 
  756  *
  757  *      This routine can be called at any spl
  758  *
   759  *      XXX vm_page_collapse() kinda expects us not to block because we
   760  *      supposedly do not need to allocate memory; for the moment, however,
   761  *      we *may* have to get a little memory from the zone allocator, but
   762  *      it is taken from the interrupt memory.  We should be ok.
  763  *
  764  *      The source object contains no vm_page_t's (which is just as well)
  765  *
  766  *      The source object is of type OBJT_SWAP.
  767  *
  768  *      The source and destination objects must be locked or 
  769  *      inaccessible (XXX are they ?)
  770  */
  771 void
  772 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
  773     vm_pindex_t offset, int destroysource)
  774 {
  775         vm_pindex_t i;
  776         int s;
  777 
  778         VM_OBJECT_LOCK_ASSERT(srcobject, MA_OWNED);
  779         VM_OBJECT_LOCK_ASSERT(dstobject, MA_OWNED);
  780 
  781         s = splvm();
  782         /*
  783          * If destroysource is set, we remove the source object from the 
  784          * swap_pager internal queue now. 
  785          */
  786         if (destroysource) {
  787                 if (srcobject->handle != NULL) {
  788                         mtx_lock(&sw_alloc_mtx);
  789                         TAILQ_REMOVE(
  790                             NOBJLIST(srcobject->handle),
  791                             srcobject,
  792                             pager_object_list
  793                         );
  794                         mtx_unlock(&sw_alloc_mtx);
  795                 }
  796         }
  797 
  798         /*
  799          * transfer source to destination.
  800          */
  801         for (i = 0; i < dstobject->size; ++i) {
  802                 daddr_t dstaddr;
  803 
  804                 /*
  805                  * Locate (without changing) the swapblk on the destination,
  806                  * unless it is invalid in which case free it silently, or
  807                  * if the destination is a resident page, in which case the
  808                  * source is thrown away.
  809                  */
  810                 dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
  811 
  812                 if (dstaddr == SWAPBLK_NONE) {
  813                         /*
  814                          * Destination has no swapblk and is not resident,
  815                          * copy source.
  816                          */
  817                         daddr_t srcaddr;
  818 
  819                         srcaddr = swp_pager_meta_ctl(
  820                             srcobject, 
  821                             i + offset,
  822                             SWM_POP
  823                         );
  824 
  825                         if (srcaddr != SWAPBLK_NONE) {
  826                                 /*
  827                                  * swp_pager_meta_build() can sleep.
  828                                  */
  829                                 vm_object_pip_add(srcobject, 1);
  830                                 VM_OBJECT_UNLOCK(srcobject);
  831                                 vm_object_pip_add(dstobject, 1);
  832                                 swp_pager_meta_build(dstobject, i, srcaddr);
  833                                 vm_object_pip_wakeup(dstobject);
  834                                 VM_OBJECT_LOCK(srcobject);
  835                                 vm_object_pip_wakeup(srcobject);
  836                         }
  837                 } else {
  838                         /*
  839                          * Destination has valid swapblk or it is represented
  840                          * by a resident page.  We destroy the sourceblock.
  841                          */
  842                         
  843                         swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
  844                 }
  845         }
  846 
  847         /*
  848          * Free left over swap blocks in source.
  849          *
   850  *      We have to revert the type to OBJT_DEFAULT so we do not accidentally
  851          * double-remove the object from the swap queues.
  852          */
  853         if (destroysource) {
  854                 swp_pager_meta_free_all(srcobject);
  855                 /*
  856                  * Reverting the type is not necessary, the caller is going
  857                  * to destroy srcobject directly, but I'm doing it here
  858                  * for consistency since we've removed the object from its
  859                  * queues.
  860                  */
  861                 srcobject->type = OBJT_DEFAULT;
  862         }
  863         splx(s);
  864 }
  865 
  866 /*
  867  * SWAP_PAGER_HASPAGE() -       determine if we have good backing store for
  868  *                              the requested page.
  869  *
  870  *      We determine whether good backing store exists for the requested
  871  *      page and return TRUE if it does, FALSE if it doesn't.
  872  *
  873  *      If TRUE, we also try to determine how much valid, contiguous backing
  874  *      store exists before and after the requested page within a reasonable
  875  *      distance.  We do not try to restrict it to the swap device stripe
  876  *      (that is handled in getpages/putpages).  It probably isn't worth
  877  *      doing here.
  878  */
  879 static boolean_t
  880 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after)
  881 {
  882         daddr_t blk0;
  883         int s;
  884 
  885         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
  886         /*
  887          * do we have good backing store at the requested index ?
  888          */
  889         s = splvm();
  890         blk0 = swp_pager_meta_ctl(object, pindex, 0);
  891 
  892         if (blk0 == SWAPBLK_NONE) {
  893                 splx(s);
  894                 if (before)
  895                         *before = 0;
  896                 if (after)
  897                         *after = 0;
  898                 return (FALSE);
  899         }
  900 
  901         /*
  902          * find backwards-looking contiguous good backing store
  903          */
  904         if (before != NULL) {
  905                 int i;
  906 
  907                 for (i = 1; i < (SWB_NPAGES/2); ++i) {
  908                         daddr_t blk;
  909 
  910                         if (i > pindex)
  911                                 break;
  912                         blk = swp_pager_meta_ctl(object, pindex - i, 0);
  913                         if (blk != blk0 - i)
  914                                 break;
  915                 }
  916                 *before = (i - 1);
  917         }
  918 
  919         /*
  920          * find forward-looking contiguous good backing store
  921          */
  922         if (after != NULL) {
  923                 int i;
  924 
  925                 for (i = 1; i < (SWB_NPAGES/2); ++i) {
  926                         daddr_t blk;
  927 
  928                         blk = swp_pager_meta_ctl(object, pindex + i, 0);
  929                         if (blk != blk0 + i)
  930                                 break;
  931                 }
  932                 *after = (i - 1);
  933         }
  934         splx(s);
  935         return (TRUE);
  936 }
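       /*
        * Semantics example (illustrative): if contiguous swap backing exists
        * only for pindexes 10 through 14 and the caller asks about pindex 12,
        * the routine returns TRUE with *before == 2 and *after == 2 (both
        * additionally capped at SWB_NPAGES/2 - 1 by the loops above).
        */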
  937 
  938 /*
  939  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
  940  *
  941  *      This removes any associated swap backing store, whether valid or
  942  *      not, from the page.  
  943  *
  944  *      This routine is typically called when a page is made dirty, at
  945  *      which point any associated swap can be freed.  MADV_FREE also
  946  *      calls us in a special-case situation
  947  *
  948  *      NOTE!!!  If the page is clean and the swap was valid, the caller
  949  *      should make the page dirty before calling this routine.  This routine
  950  *      does NOT change the m->dirty status of the page.  Also: MADV_FREE
  951  *      depends on it.
  952  *
  953  *      This routine may not block
  954  *      This routine must be called at splvm()
  955  */
  956 static void
  957 swap_pager_unswapped(vm_page_t m)
  958 {
  959 
  960         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
  961         swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
  962 }
  963 
  964 /*
  965  * SWAP_PAGER_GETPAGES() - bring pages in from swap
  966  *
  967  *      Attempt to retrieve (m, count) pages from backing store, but make
  968  *      sure we retrieve at least m[reqpage].  We try to load in as large
  969  *      a chunk surrounding m[reqpage] as is contiguous in swap and which
  970  *      belongs to the same object.
  971  *
  972  *      The code is designed for asynchronous operation and 
  973  *      immediate-notification of 'reqpage' but tends not to be
  974  *      used that way.  Please do not optimize-out this algorithmic
  975  *      feature, I intend to improve on it in the future.
  976  *
  977  *      The parent has a single vm_object_pip_add() reference prior to
  978  *      calling us and we should return with the same.
  979  *
  980  *      The parent has BUSY'd the pages.  We should return with 'm'
  981  *      left busy, but the others adjusted.
  982  */
  983 static int
  984 swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
  985 {
  986         struct buf *bp;
  987         vm_page_t mreq;
  988         int s;
  989         int i;
  990         int j;
  991         daddr_t blk;
  992 
  993         mreq = m[reqpage];
  994 
  995         KASSERT(mreq->object == object,
  996             ("swap_pager_getpages: object mismatch %p/%p",
  997             object, mreq->object));
  998 
  999         /*
 1000          * Calculate range to retrieve.  The pages have already been assigned
  1001          * their swapblks.  We require a *contiguous* range, and we know it
  1002          * does not span devices.  If we do not supply one, bad things
  1003          * happen.  Note that blk, iblk & jblk can be SWAPBLK_NONE, but the
 1004          * loops are set up such that the case(s) are handled implicitly.
 1005          *
 1006          * The swp_*() calls must be made at splvm().  vm_page_free() does
 1007          * not need to be, but it will go a little faster if it is.
 1008          */
 1009         s = splvm();
 1010         blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
 1011 
 1012         for (i = reqpage - 1; i >= 0; --i) {
 1013                 daddr_t iblk;
 1014 
 1015                 iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
 1016                 if (blk != iblk + (reqpage - i))
 1017                         break;
 1018         }
 1019         ++i;
 1020 
 1021         for (j = reqpage + 1; j < count; ++j) {
 1022                 daddr_t jblk;
 1023 
 1024                 jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
 1025                 if (blk != jblk - (j - reqpage))
 1026                         break;
 1027         }
 1028 
 1029         /*
 1030          * free pages outside our collection range.   Note: we never free
 1031          * mreq, it must remain busy throughout.
 1032          */
 1033         vm_page_lock_queues();
 1034         {
 1035                 int k;
 1036 
 1037                 for (k = 0; k < i; ++k)
 1038                         vm_page_free(m[k]);
 1039                 for (k = j; k < count; ++k)
 1040                         vm_page_free(m[k]);
 1041         }
 1042         vm_page_unlock_queues();
 1043         splx(s);
 1044 
 1045 
 1046         /*
 1047          * Return VM_PAGER_FAIL if we have nothing to do.  Return mreq 
 1048          * still busy, but the others unbusied.
 1049          */
 1050         if (blk == SWAPBLK_NONE)
 1051                 return (VM_PAGER_FAIL);
 1052 
 1053         /*
 1054          * Getpbuf() can sleep.
 1055          */
 1056         VM_OBJECT_UNLOCK(object);
 1057         /*
 1058          * Get a swap buffer header to perform the IO
 1059          */
 1060         bp = getpbuf(&nsw_rcount);
 1061         bp->b_flags |= B_PAGING;
 1062 
 1063         /*
 1064          * map our page(s) into kva for input
 1065          */
 1066         pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i);
 1067 
 1068         bp->b_iocmd = BIO_READ;
 1069         bp->b_iodone = swp_pager_async_iodone;
 1070         bp->b_rcred = crhold(thread0.td_ucred);
 1071         bp->b_wcred = crhold(thread0.td_ucred);
 1072         bp->b_blkno = blk - (reqpage - i);
 1073         bp->b_bcount = PAGE_SIZE * (j - i);
 1074         bp->b_bufsize = PAGE_SIZE * (j - i);
 1075         bp->b_pager.pg_reqpage = reqpage - i;
 1076 
 1077         VM_OBJECT_LOCK(object);
 1078         vm_page_lock_queues();
 1079         {
 1080                 int k;
 1081 
 1082                 for (k = i; k < j; ++k) {
 1083                         bp->b_pages[k - i] = m[k];
 1084                         vm_page_flag_set(m[k], PG_SWAPINPROG);
 1085                 }
 1086         }
 1087         vm_page_unlock_queues();
 1088         VM_OBJECT_UNLOCK(object);
 1089         bp->b_npages = j - i;
 1090 
 1091         cnt.v_swapin++;
 1092         cnt.v_swappgsin += bp->b_npages;
 1093 
 1094         /*
 1095          * We still hold the lock on mreq, and our automatic completion routine
 1096          * does not remove it.
 1097          */
 1098         VM_OBJECT_LOCK(mreq->object);
 1099         vm_object_pip_add(mreq->object, bp->b_npages);
 1100         VM_OBJECT_UNLOCK(mreq->object);
 1101 
 1102         /*
 1103          * perform the I/O.  NOTE!!!  bp cannot be considered valid after
 1104          * this point because we automatically release it on completion.
 1105          * Instead, we look at the one page we are interested in which we
 1106          * still hold a lock on even through the I/O completion.
 1107          *
 1108          * The other pages in our m[] array are also released on completion,
 1109          * so we cannot assume they are valid anymore either.
 1110          *
 1111          * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 1112          */
 1113         BUF_KERNPROC(bp);
 1114         swp_pager_strategy(bp);
 1115 
 1116         /*
 1117          * wait for the page we want to complete.  PG_SWAPINPROG is always
 1118          * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
 1119          * is set in the meta-data.
 1120          */
 1121         s = splvm();
 1122         vm_page_lock_queues();
 1123         while ((mreq->flags & PG_SWAPINPROG) != 0) {
 1124                 vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED);
 1125                 cnt.v_intrans++;
 1126                 if (msleep(mreq, &vm_page_queue_mtx, PSWP, "swread", hz*20)) {
 1127                         printf(
 1128 "swap_pager: indefinite wait buffer: device: %s, blkno: %jd, size: %ld\n",
 1129                             bp->b_dev == NULL ? "[NULL]" : devtoname(bp->b_dev),
 1130                             (intmax_t)bp->b_blkno, bp->b_bcount);
 1131                 }
 1132         }
 1133         vm_page_unlock_queues();
 1134         splx(s);
 1135 
 1136         VM_OBJECT_LOCK(mreq->object);
 1137         /*
 1138          * mreq is left busied after completion, but all the other pages
 1139          * are freed.  If we had an unrecoverable read error the page will
 1140          * not be valid.
 1141          */
 1142         if (mreq->valid != VM_PAGE_BITS_ALL) {
 1143                 return (VM_PAGER_ERROR);
 1144         } else {
 1145                 return (VM_PAGER_OK);
 1146         }
 1147 
 1148         /*
 1149          * A final note: in a low swap situation, we cannot deallocate swap
 1150          * and mark a page dirty here because the caller is likely to mark
 1151          * the page clean when we return, causing the page to possibly revert 
  1152          * to all-zeros later.
 1153          */
 1154 }
 1155 
 1156 /*
 1157  *      swap_pager_putpages: 
 1158  *
 1159  *      Assign swap (if necessary) and initiate I/O on the specified pages.
 1160  *
 1161  *      We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
 1162  *      are automatically converted to SWAP objects.
 1163  *
 1164  *      In a low memory situation we may block in VOP_STRATEGY(), but the new 
 1165  *      vm_page reservation system coupled with properly written VFS devices 
 1166  *      should ensure that no low-memory deadlock occurs.  This is an area
 1167  *      which needs work.
 1168  *
 1169  *      The parent has N vm_object_pip_add() references prior to
 1170  *      calling us and will remove references for rtvals[] that are
 1171  *      not set to VM_PAGER_PEND.  We need to remove the rest on I/O
 1172  *      completion.
 1173  *
 1174  *      The parent has soft-busy'd the pages it passes us and will unbusy
  1175  *      those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
 1176  *      We need to unbusy the rest on I/O completion.
 1177  */
 1178 void
 1179 swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
 1180     boolean_t sync, int *rtvals)
 1181 {
 1182         int i;
 1183         int n = 0;
 1184 
 1185         GIANT_REQUIRED;
 1186         if (count && m[0]->object != object) {
  1187                 panic("swap_pager_putpages: object mismatch %p/%p", 
 1188                     object, 
 1189                     m[0]->object
 1190                 );
 1191         }
 1192 
 1193         /*
 1194          * Step 1
 1195          *
 1196          * Turn object into OBJT_SWAP
 1197          * check for bogus sysops
 1198          * force sync if not pageout process
 1199          */
 1200         if (object->type != OBJT_SWAP)
 1201                 swp_pager_meta_build(object, 0, SWAPBLK_NONE);
 1202         VM_OBJECT_UNLOCK(object);
 1203 
 1204         if (curproc != pageproc)
 1205                 sync = TRUE;
 1206 
 1207         /*
 1208          * Step 2
 1209          *
 1210          * Update nsw parameters from swap_async_max sysctl values.  
 1211          * Do not let the sysop crash the machine with bogus numbers.
 1212          */
 1213         mtx_lock(&pbuf_mtx);
 1214         if (swap_async_max != nsw_wcount_async_max) {
 1215                 int n;
 1216                 int s;
 1217 
 1218                 /*
 1219                  * limit range
 1220                  */
 1221                 if ((n = swap_async_max) > nswbuf / 2)
 1222                         n = nswbuf / 2;
 1223                 if (n < 1)
 1224                         n = 1;
 1225                 swap_async_max = n;
 1226 
 1227                 /*
 1228                  * Adjust difference ( if possible ).  If the current async
 1229                  * count is too low, we may not be able to make the adjustment
 1230                  * at this time.
 1231                  */
 1232                 s = splvm();
 1233                 n -= nsw_wcount_async_max;
 1234                 if (nsw_wcount_async + n >= 0) {
 1235                         nsw_wcount_async += n;
 1236                         nsw_wcount_async_max += n;
 1237                         wakeup(&nsw_wcount_async);
 1238                 }
 1239                 splx(s);
 1240         }
 1241         mtx_unlock(&pbuf_mtx);
 1242 
 1243         /*
 1244          * Step 3
 1245          *
 1246          * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
 1247          * The page is left dirty until the pageout operation completes
 1248          * successfully.
 1249          */
 1250         for (i = 0; i < count; i += n) {
 1251                 int s;
 1252                 int j;
 1253                 struct buf *bp;
 1254                 daddr_t blk;
 1255 
 1256                 /*
 1257                  * Maximum I/O size is limited by a number of factors.
 1258                  */
 1259                 n = min(BLIST_MAX_ALLOC, count - i);
 1260                 n = min(n, nsw_cluster_max);
 1261 
 1262                 s = splvm();
 1263 
 1264                 /*
 1265                  * Get biggest block of swap we can.  If we fail, fall
 1266                  * back and try to allocate a smaller block.  Don't go
 1267                  * overboard trying to allocate space if it would overly
 1268                  * fragment swap.
 1269                  */
 1270                 while (
 1271                     (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE &&
 1272                     n > 4
 1273                 ) {
 1274                         n >>= 1;
 1275                 }
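                        /*
                         * Fallback example (added note): a 16-page request
                         * that cannot be satisfied is retried as 8 and then
                         * 4 pages; if even 4 contiguous pages cannot be
                         * found, the pages in this chunk are failed below
                         * rather than fragmenting swap further.
                         */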
 1276                 if (blk == SWAPBLK_NONE) {
 1277                         for (j = 0; j < n; ++j)
 1278                                 rtvals[i+j] = VM_PAGER_FAIL;
 1279                         splx(s);
 1280                         continue;
 1281                 }
 1282 
 1283                 /*
 1284                  * All I/O parameters have been satisfied, build the I/O
 1285                  * request and assign the swap space.
 1286                  */
 1287                 if (sync == TRUE) {
 1288                         bp = getpbuf(&nsw_wcount_sync);
 1289                 } else {
 1290                         bp = getpbuf(&nsw_wcount_async);
 1291                         bp->b_flags = B_ASYNC;
 1292                 }
 1293                 bp->b_flags |= B_PAGING;
 1294                 bp->b_iocmd = BIO_WRITE;
 1295 
 1296                 pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
 1297 
 1298                 bp->b_rcred = crhold(thread0.td_ucred);
 1299                 bp->b_wcred = crhold(thread0.td_ucred);
 1300                 bp->b_bcount = PAGE_SIZE * n;
 1301                 bp->b_bufsize = PAGE_SIZE * n;
 1302                 bp->b_blkno = blk;
 1303 
 1304                 VM_OBJECT_LOCK(object);
 1305                 for (j = 0; j < n; ++j) {
 1306                         vm_page_t mreq = m[i+j];
 1307 
 1308                         swp_pager_meta_build(
 1309                             mreq->object, 
 1310                             mreq->pindex,
 1311                             blk + j
 1312                         );
 1313                         vm_page_dirty(mreq);
 1314                         rtvals[i+j] = VM_PAGER_OK;
 1315 
 1316                         vm_page_lock_queues();
 1317                         vm_page_flag_set(mreq, PG_SWAPINPROG);
 1318                         vm_page_unlock_queues();
 1319                         bp->b_pages[j] = mreq;
 1320                 }
 1321                 VM_OBJECT_UNLOCK(object);
 1322                 bp->b_npages = n;
 1323                 /*
 1324                  * Must set dirty range for NFS to work.
 1325                  */
 1326                 bp->b_dirtyoff = 0;
 1327                 bp->b_dirtyend = bp->b_bcount;
 1328 
 1329                 cnt.v_swapout++;
 1330                 cnt.v_swappgsout += bp->b_npages;
 1331 
 1332                 splx(s);
 1333 
 1334                 /*
 1335                  * asynchronous
 1336                  *
 1337                  * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 1338                  */
 1339                 if (sync == FALSE) {
 1340                         bp->b_iodone = swp_pager_async_iodone;
 1341                         BUF_KERNPROC(bp);
 1342                         swp_pager_strategy(bp);
 1343 
 1344                         for (j = 0; j < n; ++j)
 1345                                 rtvals[i+j] = VM_PAGER_PEND;
  1346                         /* restart outer loop */
 1347                         continue;
 1348                 }
 1349 
 1350                 /*
 1351                  * synchronous
 1352                  *
 1353                  * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 1354                  */
 1355                 bp->b_iodone = bdone;
 1356                 swp_pager_strategy(bp);
 1357 
 1358                 /*
 1359                  * Wait for the sync I/O to complete, then update rtvals.
 1360                  * We just set the rtvals[] to VM_PAGER_PEND so we can call
 1361                  * our async completion routine at the end, thus avoiding a
 1362                  * double-free.
 1363                  */
 1364                 s = splbio();
 1365                 bwait(bp, PVM, "swwrt");
 1366                 for (j = 0; j < n; ++j)
 1367                         rtvals[i+j] = VM_PAGER_PEND;
 1368                 /*
 1369                  * Now that we are through with the bp, we can call the
 1370                  * normal async completion, which frees everything up.
 1371                  */
 1372                 swp_pager_async_iodone(bp);
 1373                 splx(s);
 1374         }
 1375         VM_OBJECT_LOCK(object);
 1376 }
 1377 
 1378 /*
 1379  *      swp_pager_async_iodone:
 1380  *
 1381  *      Completion routine for asynchronous reads and writes from/to swap.
 1382  *      Also called manually by synchronous code to finish up a bp.
 1383  *
 1384  *      For READ operations, the pages are PG_BUSY'd.  For WRITE operations,
 1385  *      the pages are vm_page_t->busy'd.  For READ operations, we PG_BUSY
 1386  *      unbusy all pages except the 'main' request page.  For WRITE
 1387  *      operations, we vm_page_t->busy unbusy all pages (we can do this
 1388  *      because we marked them all VM_PAGER_PEND on return from putpages).
 1389  *
 1390  *      This routine may not block.
 1391  *      This routine is called at splbio() or better
 1392  *
 1393  *      We up ourselves to splvm() as required for various vm_page related
 1394  *      calls.
 1395  */
 1396 static void
 1397 swp_pager_async_iodone(struct buf *bp)
 1398 {
 1399         int s;
 1400         int i;
 1401         vm_object_t object = NULL;
 1402 
 1403         bp->b_flags |= B_DONE;
 1404 
 1405         /*
 1406          * report error
 1407          */
 1408         if (bp->b_ioflags & BIO_ERROR) {
 1409                 printf(
 1410                     "swap_pager: I/O error - %s failed; blkno %ld, "
 1411                         "size %ld, error %d\n",
 1412                     ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
 1413                     (long)bp->b_blkno, 
 1414                     (long)bp->b_bcount,
 1415                     bp->b_error
 1416                 );
 1417         }
 1418 
 1419         /*
 1420          * set object, raise to splvm().
 1421          */
 1422         s = splvm();
 1423 
 1424         /*
 1425          * remove the mapping for kernel virtual
 1426          */
 1427         pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
 1428 
 1429         if (bp->b_npages) {
 1430                 object = bp->b_pages[0]->object;
 1431                 VM_OBJECT_LOCK(object);
 1432         }
 1433         vm_page_lock_queues();
 1434         /*
 1435          * cleanup pages.  If an error occurs writing to swap, we are in
 1436          * very serious trouble.  If it happens to be a disk error, though,
 1437          * we may be able to recover by reassigning the swap later on.  So
 1438          * in this case we remove the m->swapblk assignment for the page 
 1439          * but do not free it in the rlist.  The erroneous block(s) are thus
 1440          * never reallocated as swap.  Redirty the page and continue.
 1441          */
 1442         for (i = 0; i < bp->b_npages; ++i) {
 1443                 vm_page_t m = bp->b_pages[i];
 1444 
 1445                 vm_page_flag_clear(m, PG_SWAPINPROG);
 1446 
 1447                 if (bp->b_ioflags & BIO_ERROR) {
 1448                         /*
 1449                          * If an error occurs I'd love to throw the swapblk
 1450                          * away without freeing it back to swapspace, so it
 1451                          * can never be used again.  But I can't from an 
 1452                          * interrupt.
 1453                          */
 1454                         if (bp->b_iocmd == BIO_READ) {
 1455                                 /*
 1456                                  * When reading, reqpage needs to stay
 1457                                  * locked for the parent, but all other
 1458                                  * pages can be freed.  We still want to
 1459                                  * wakeup the parent waiting on the page,
 1460                                  * though.  ( also: pg_reqpage can be -1 and 
 1461                                  * not match anything ).
 1462                                  *
 1463                                  * We have to wake specifically requested pages
 1464                                  * up too because we cleared PG_SWAPINPROG and
 1465                                  * someone may be waiting for that.
 1466                                  *
 1467                                  * NOTE: for reads, m->dirty will probably
 1468                                  * be overridden by the original caller of
 1469                                  * getpages so don't play cute tricks here.
 1470                                  *
 1471                                  * XXX IT IS NOT LEGAL TO FREE THE PAGE HERE
 1472                                  * AS THIS MESSES WITH object->memq, and it is
 1473                                  * not legal to mess with object->memq from an
 1474                                  * interrupt.
 1475                                  */
 1476                                 m->valid = 0;
 1477                                 if (i != bp->b_pager.pg_reqpage)
 1478                                         vm_page_free(m);
 1479                                 else
 1480                                         vm_page_flash(m);
 1481                                 /*
 1482                                  * If i == bp->b_pager.pg_reqpage, do not wake 
 1483                                  * the page up.  The caller needs to.
 1484                                  */
 1485                         } else {
 1486                                 /*
 1487                                  * If a write error occurs, reactivate page
 1488                                  * so it doesn't clog the inactive list,
 1489                                  * then finish the I/O.
 1490                                  */
 1491                                 vm_page_dirty(m);
 1492                                 vm_page_activate(m);
 1493                                 vm_page_io_finish(m);
 1494                         }
 1495                 } else if (bp->b_iocmd == BIO_READ) {
 1496                         /*
 1497                          * For read success, clear dirty bits.  Nobody should
 1498                          * have this page mapped, but don't take any chances:
 1499                          * make sure the pmap modify bits are also cleared.
 1500                          *
 1501                          * NOTE: for reads, m->dirty will probably be 
 1502                          * overridden by the original caller of getpages so
 1503                          * we cannot set them in order to free the underlying
 1504                          * swap in a low-swap situation.  I don't think we'd
 1505                          * want to do that anyway, but it was an optimization
 1506                          * that existed in the old swapper for a time before
 1507                          * it got ripped out due to precisely this problem.
 1508                          *
 1509                          * If not the requested page then deactivate it.
 1510                          *
 1511                          * Note that the requested page, reqpage, is left
 1512                          * busied, but we still have to wake it up.  The
 1513                          * other pages are released (unbusied) by 
 1514                          * vm_page_wakeup().  We do not set reqpage's
 1515                          * valid bits here, it is up to the caller.
 1516                          */
 1517                         pmap_clear_modify(m);
 1518                         m->valid = VM_PAGE_BITS_ALL;
 1519                         vm_page_undirty(m);
 1520 
 1521                         /*
 1522                          * We have to wake specifically requested pages
 1523                          * up too because we cleared PG_SWAPINPROG and
 1524                          * someone could be waiting for it in getpages.
 1525                          * However, be sure not to unbusy the specifically
 1526                          * requested page - getpages expects it to be
 1527                          * left busy.
 1528                          */
 1529                         if (i != bp->b_pager.pg_reqpage) {
 1530                                 vm_page_deactivate(m);
 1531                                 vm_page_wakeup(m);
 1532                         } else {
 1533                                 vm_page_flash(m);
 1534                         }
 1535                 } else {
 1536                         /*
 1537                          * For write success, clear the modify and dirty 
 1538                          * status, then finish the I/O ( which decrements the 
 1539                          * busy count and possibly wakes waiters up ).
 1540                          */
 1541                         pmap_clear_modify(m);
 1542                         vm_page_undirty(m);
 1543                         vm_page_io_finish(m);
 1544                         if (vm_page_count_severe())
 1545                                 vm_page_try_to_cache(m);
 1546                 }
 1547         }
 1548         vm_page_unlock_queues();
 1549 
 1550         /*
 1551          * adjust pip.  NOTE: the original parent may still have its own
 1552          * pip refs on the object.
 1553          */
 1554         if (object != NULL) {
 1555                 vm_object_pip_wakeupn(object, bp->b_npages);
 1556                 VM_OBJECT_UNLOCK(object);
 1557         }
 1558 
 1559         /*
 1560          * release the physical I/O buffer
 1561          */
 1562         relpbuf(
 1563             bp, 
 1564             ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : 
 1565                 ((bp->b_flags & B_ASYNC) ? 
 1566                     &nsw_wcount_async : 
 1567                     &nsw_wcount_sync
 1568                 )
 1569             )
 1570         );
 1571         splx(s);
 1572 }
 1573 
 1574 /*
 1575  *      swap_pager_isswapped:
 1576  *
 1577  *      Return 1 if at least one page in the given object is paged
 1578  *      out to the given swap device.
 1579  *
 1580  *      This routine may not block.
 1581  */
 1582 int
 1583 swap_pager_isswapped(vm_object_t object, struct swdevt *sp)
 1584 {
 1585         daddr_t index = 0;
 1586         int bcount;
 1587         int i;
 1588 
 1589         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1590         if (object->type != OBJT_SWAP)
 1591                 return (0);
 1592 
 1593         mtx_lock(&swhash_mtx);
 1594         for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) {
 1595                 struct swblock *swap;
 1596 
 1597                 if ((swap = *swp_pager_hash(object, index)) != NULL) {
 1598                         for (i = 0; i < SWAP_META_PAGES; ++i) {
 1599                                 if (swp_pager_isondev(swap->swb_pages[i], sp)) {
 1600                                         mtx_unlock(&swhash_mtx);
 1601                                         return (1);
 1602                                 }
 1603                         }
 1604                 }
 1605                 index += SWAP_META_PAGES;
 1606                 if (index > 0x20000000)
 1607                         panic("swap_pager_isswapped: failed to locate all swap meta blocks");
 1608         }
 1609         mtx_unlock(&swhash_mtx);
 1610         return (0);
 1611 }
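
/*
 * Illustrative sketch (not part of the original file): swp_pager_isondev(),
 * used above, is defined earlier in this file.  Conceptually it is just a
 * range test against the device's slice [sw_first, sw_end) of the unified
 * swap block address space; the exact form shown here is an approximation.
 */
static __inline int
swp_pager_isondev_sketch(daddr_t blk, struct swdevt *sp)
{

        return (blk >= sp->sw_first && blk < sp->sw_end);
}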
 1612 
 1613 /*
 1614  * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
 1615  *
 1616  *      This routine dissociates the page at the given index within a
 1617  *      swap block from its backing store, paging it in if necessary.
 1618  *      If the page is paged in, it is placed in the inactive queue,
 1619  *      since it had its backing store ripped out from under it.
 1620  *      We also attempt to swap in all other pages in the swap block,
 1621  *      but we only guarantee that the one at the specified index is
 1622  *      paged in.
 1623  *
 1624  *      XXX - The code to page the whole block in doesn't work, so we
 1625  *            revert to the one-by-one behavior for now.  Sigh.
 1626  */
 1627 static __inline void
 1628 swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex)
 1629 {
 1630         vm_page_t m;
 1631 
 1632         vm_object_pip_add(object, 1);
 1633         m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
 1634         if (m->valid == VM_PAGE_BITS_ALL) {
 1635                 vm_object_pip_subtract(object, 1);
 1636                 vm_page_lock_queues();
 1637                 vm_page_activate(m);
 1638                 vm_page_dirty(m);
 1639                 vm_page_wakeup(m);
 1640                 vm_page_unlock_queues();
 1641                 vm_pager_page_unswapped(m);
 1642                 return;
 1643         }
 1644 
 1645         if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK)
 1646                 panic("swap_pager_force_pagein: read from swap failed");/*XXX*/
 1647         vm_object_pip_subtract(object, 1);
 1648         vm_page_lock_queues();
 1649         vm_page_dirty(m);
 1650         vm_page_dontneed(m);
 1651         vm_page_wakeup(m);
 1652         vm_page_unlock_queues();
 1653         vm_pager_page_unswapped(m);
 1654 }
 1655 
 1656 /*
 1657  *      swap_pager_swapoff:
 1658  *
 1659  *      Page in all of the pages that have been paged out to the
 1660  *      given device.  The corresponding blocks in the bitmap must be
 1661  *      marked as allocated and the device must be flagged SW_CLOSING.
 1662  *      There may be no processes swapped out to the device.
 1663  *
 1664  *      This routine may block.
 1665  */
 1666 static void
 1667 swap_pager_swapoff(struct swdevt *sp)
 1668 {
 1669         struct swblock *swap;
 1670         int i, j, retries;
 1671 
 1672         GIANT_REQUIRED;
 1673 
 1674         retries = 0;
 1675 full_rescan:
 1676         mtx_lock(&swhash_mtx);
 1677         for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
 1678 restart:
 1679                 for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) {
 1680                         vm_object_t object = swap->swb_object;
 1681                         vm_pindex_t pindex = swap->swb_index;
 1682                         for (j = 0; j < SWAP_META_PAGES; ++j) {
 1683                                 if (swp_pager_isondev(swap->swb_pages[j], sp)) {
 1684                                         /* avoid deadlock */
 1685                                         if (!VM_OBJECT_TRYLOCK(object)) {
 1686                                                 break;
 1687                                         } else {
 1688                                                 mtx_unlock(&swhash_mtx);
 1689                                                 swp_pager_force_pagein(object,
 1690                                                     pindex + j);
 1691                                                 VM_OBJECT_UNLOCK(object);
 1692                                                 mtx_lock(&swhash_mtx);
 1693                                                 goto restart;
 1694                                         }
 1695                                 }
 1696                         }
 1697                 }
 1698         }
 1699         mtx_unlock(&swhash_mtx);
 1700         if (sp->sw_used) {
 1701                 int dummy;
 1702                 /*
 1703                  * Objects may be locked or paging to the device being
 1704                  * removed, so we will miss their pages and need to
 1705                  * make another pass.  We have marked this device as
 1706                  * SW_CLOSING, so the activity should finish soon.
 1707                  */
 1708                 retries++;
 1709                 if (retries > 100) {
 1710                         panic("swapoff: failed to locate %d swap blocks",
 1711                             sp->sw_used);
 1712                 }
 1713                 tsleep(&dummy, PVM, "swpoff", hz / 20);
 1714                 goto full_rescan;
 1715         }
 1716 }
 1717 
 1718 /************************************************************************
 1719  *                              SWAP META DATA                          *
 1720  ************************************************************************
 1721  *
 1722  *      These routines manipulate the swap metadata stored in the 
 1723  *      OBJT_SWAP object.  All swp_*() routines must be called at
 1724  *      splvm() because swap can be freed up by the low level vm_page
 1725  *      code which might be called from interrupts beyond what splbio() covers.
 1726  *
 1727  *      Swap metadata is implemented with a global hash and not directly
 1728  *      linked into the object.  Instead the object simply contains
 1729  *      appropriate tracking counters.
 1730  */
 1731 
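/*
 * Illustrative sketch (not part of the original file): the global hash
 * mentioned above is keyed by (object, pindex) with pindex rounded down to
 * a SWAP_META_PAGES boundary.  swp_pager_hash(), defined earlier in this
 * file, conceptually walks a bucket chain like this; the hash computation
 * shown here is an approximation, not the exact one used by the file.
 */
static __inline struct swblock **
swp_pager_hash_sketch(vm_object_t object, vm_pindex_t pindex)
{
        struct swblock **pswap;
        struct swblock *swap;

        pindex &= ~(vm_pindex_t)SWAP_META_MASK; /* first page the swblock covers */
        pswap = &swhash[((uintptr_t)object + (uintptr_t)pindex) & swhash_mask];
        while ((swap = *pswap) != NULL) {
                if (swap->swb_object == object && swap->swb_index == pindex)
                        break;
                pswap = &swap->swb_hnext;
        }
        return (pswap); /* points at the match, or at the NULL chain tail */
}
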
 1732 /*
 1733  * SWP_PAGER_META_BUILD() -     add swap block to swap meta data for object
 1734  *
 1735  *      We first convert the object to a swap object if it is a default
 1736  *      object.
 1737  *
 1738  *      The specified swapblk is added to the object's swap metadata.  If
 1739  *      the swapblk is not valid, it is freed instead.  Any previously
 1740  *      assigned swapblk is freed.
 1741  *
 1742  *      This routine must be called at splvm(), except when used to convert
 1743  *      an OBJT_DEFAULT object into an OBJT_SWAP object.
 1744  */
 1745 static void
 1746 swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
 1747 {
 1748         struct swblock *swap;
 1749         struct swblock **pswap;
 1750         int idx;
 1751 
 1752         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1753         /*
 1754          * Convert default object to swap object if necessary
 1755          */
 1756         if (object->type != OBJT_SWAP) {
 1757                 object->type = OBJT_SWAP;
 1758                 object->un_pager.swp.swp_bcount = 0;
 1759 
 1760                 if (object->handle != NULL) {
 1761                         mtx_lock(&sw_alloc_mtx);
 1762                         TAILQ_INSERT_TAIL(
 1763                             NOBJLIST(object->handle),
 1764                             object, 
 1765                             pager_object_list
 1766                         );
 1767                         mtx_unlock(&sw_alloc_mtx);
 1768                 }
 1769         }
 1770         
 1771         /*
 1772          * Locate the hash entry.  If not found, create it; if we aren't adding
 1773          * anything, just return.  If we run out of space in the map we wait
 1774          * and, since the hash table may have changed, retry.
 1775          */
 1776 retry:
 1777         mtx_lock(&swhash_mtx);
 1778         pswap = swp_pager_hash(object, pindex);
 1779 
 1780         if ((swap = *pswap) == NULL) {
 1781                 int i;
 1782 
 1783                 if (swapblk == SWAPBLK_NONE)
 1784                         goto done;
 1785 
 1786                 swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT);
 1787                 if (swap == NULL) {
 1788                         mtx_unlock(&swhash_mtx);
 1789                         VM_OBJECT_UNLOCK(object);
 1790                         VM_WAIT;
 1791                         VM_OBJECT_LOCK(object);
 1792                         goto retry;
 1793                 }
 1794 
 1795                 swap->swb_hnext = NULL;
 1796                 swap->swb_object = object;
 1797                 swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK;
 1798                 swap->swb_count = 0;
 1799 
 1800                 ++object->un_pager.swp.swp_bcount;
 1801 
 1802                 for (i = 0; i < SWAP_META_PAGES; ++i)
 1803                         swap->swb_pages[i] = SWAPBLK_NONE;
 1804         }
 1805 
 1806         /*
 1807          * Delete prior contents of metadata
 1808          */
 1809         idx = pindex & SWAP_META_MASK;
 1810 
 1811         if (swap->swb_pages[idx] != SWAPBLK_NONE) {
 1812                 swp_pager_freeswapspace(swap->swb_pages[idx], 1);
 1813                 --swap->swb_count;
 1814         }
 1815 
 1816         /*
 1817          * Enter block into metadata
 1818          */
 1819         swap->swb_pages[idx] = swapblk;
 1820         if (swapblk != SWAPBLK_NONE)
 1821                 ++swap->swb_count;
 1822 done:
 1823         mtx_unlock(&swhash_mtx);
 1824 }
 1825 
 1826 /*
 1827  * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
 1828  *
 1829  *      The requested range of blocks is freed, with any associated swap 
 1830  *      returned to the swap bitmap.
 1831  *
 1832  *      This routine will free swap metadata structures as they are cleaned 
 1833  *      out.  This routine does *NOT* operate on swap metadata associated
 1834  *      with resident pages.
 1835  *
 1836  *      This routine must be called at splvm()
 1837  */
 1838 static void
 1839 swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
 1840 {
 1841 
 1842         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1843         if (object->type != OBJT_SWAP)
 1844                 return;
 1845 
 1846         while (count > 0) {
 1847                 struct swblock **pswap;
 1848                 struct swblock *swap;
 1849 
 1850                 mtx_lock(&swhash_mtx);
 1851                 pswap = swp_pager_hash(object, index);
 1852 
 1853                 if ((swap = *pswap) != NULL) {
 1854                         daddr_t v = swap->swb_pages[index & SWAP_META_MASK];
 1855 
 1856                         if (v != SWAPBLK_NONE) {
 1857                                 swp_pager_freeswapspace(v, 1);
 1858                                 swap->swb_pages[index & SWAP_META_MASK] =
 1859                                         SWAPBLK_NONE;
 1860                                 if (--swap->swb_count == 0) {
 1861                                         *pswap = swap->swb_hnext;
 1862                                         uma_zfree(swap_zone, swap);
 1863                                         --object->un_pager.swp.swp_bcount;
 1864                                 }
 1865                         }
 1866                         --count;
 1867                         ++index;
 1868                 } else {
 1869                         int n = SWAP_META_PAGES - (index & SWAP_META_MASK);
 1870                         count -= n;
 1871                         index += n;
 1872                 }
 1873                 mtx_unlock(&swhash_mtx);
 1874         }
 1875 }
 1876 
 1877 /*
 1878  * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
 1879  *
 1880  *      This routine locates and destroys all swap metadata associated with
 1881  *      an object.
 1882  *
 1883  *      This routine must be called at splvm()
 1884  */
 1885 static void
 1886 swp_pager_meta_free_all(vm_object_t object)
 1887 {
 1888         daddr_t index = 0;
 1889 
 1890         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1891         if (object->type != OBJT_SWAP)
 1892                 return;
 1893 
 1894         while (object->un_pager.swp.swp_bcount) {
 1895                 struct swblock **pswap;
 1896                 struct swblock *swap;
 1897 
 1898                 mtx_lock(&swhash_mtx);
 1899                 pswap = swp_pager_hash(object, index);
 1900                 if ((swap = *pswap) != NULL) {
 1901                         int i;
 1902 
 1903                         for (i = 0; i < SWAP_META_PAGES; ++i) {
 1904                                 daddr_t v = swap->swb_pages[i];
 1905                                 if (v != SWAPBLK_NONE) {
 1906                                         --swap->swb_count;
 1907                                         swp_pager_freeswapspace(v, 1);
 1908                                 }
 1909                         }
 1910                         if (swap->swb_count != 0)
 1911                                 panic("swap_pager_meta_free_all: swb_count != 0");
 1912                         *pswap = swap->swb_hnext;
 1913                         uma_zfree(swap_zone, swap);
 1914                         --object->un_pager.swp.swp_bcount;
 1915                 }
 1916                 mtx_unlock(&swhash_mtx);
 1917                 index += SWAP_META_PAGES;
 1918                 if (index > 0x20000000)
 1919                         panic("swp_pager_meta_free_all: failed to locate all swap meta blocks");
 1920         }
 1921 }
 1922 
 1923 /*
 1924  * SWP_PAGER_METACTL() -  misc control of swap and vm_page_t meta data.
 1925  *
 1926  *      This routine is capable of looking up, popping, or freeing
 1927  *      swapblk assignments in the swap meta data or in the vm_page_t.
 1928  *      The routine typically returns the swapblk being looked up or
 1929  *      popped, or SWAPBLK_NONE if the block was freed or was invalid.
 1930  *      This routine will automatically free any invalid
 1931  *      meta-data swapblks.
 1932  *
 1933  *      It is not possible to store invalid swapblks in the swap meta data
 1934  *      (other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
 1935  *
 1936  *      When acting on a busy resident page and paging is in progress, we 
 1937  *      have to wait until paging is complete but otherwise can act on the 
 1938  *      busy page.
 1939  *
 1940  *      This routine must be called at splvm().
 1941  *
 1942  *      SWM_FREE        remove and free swap block from metadata
 1943  *      SWM_POP         remove from meta data but do not free; pop it out
 1944  */
 1945 static daddr_t
 1946 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
 1947 {
 1948         struct swblock **pswap;
 1949         struct swblock *swap;
 1950         daddr_t r1;
 1951         int idx;
 1952 
 1953         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1954         /*
 1955          * The meta data only exists if the object is OBJT_SWAP,
 1956          * and even then it might not be allocated yet.
 1957          */
 1958         if (object->type != OBJT_SWAP)
 1959                 return (SWAPBLK_NONE);
 1960 
 1961         r1 = SWAPBLK_NONE;
 1962         mtx_lock(&swhash_mtx);
 1963         pswap = swp_pager_hash(object, pindex);
 1964 
 1965         if ((swap = *pswap) != NULL) {
 1966                 idx = pindex & SWAP_META_MASK;
 1967                 r1 = swap->swb_pages[idx];
 1968 
 1969                 if (r1 != SWAPBLK_NONE) {
 1970                         if (flags & SWM_FREE) {
 1971                                 swp_pager_freeswapspace(r1, 1);
 1972                                 r1 = SWAPBLK_NONE;
 1973                         }
 1974                         if (flags & (SWM_FREE|SWM_POP)) {
 1975                                 swap->swb_pages[idx] = SWAPBLK_NONE;
 1976                                 if (--swap->swb_count == 0) {
 1977                                         *pswap = swap->swb_hnext;
 1978                                         uma_zfree(swap_zone, swap);
 1979                                         --object->un_pager.swp.swp_bcount;
 1980                                 }
 1981                         } 
 1982                 }
 1983         }
 1984         mtx_unlock(&swhash_mtx);
 1985         return (r1);
 1986 }
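
/*
 * Illustrative usage (not part of the original file): the three ways the
 * pager typically drives swp_pager_meta_ctl().  The call sites are
 * simplified here and the splvm()/locking environment is assumed rather
 * than shown in full.
 */
static void
swp_pager_meta_ctl_usage_sketch(vm_object_t object, vm_pindex_t pindex)
{
        daddr_t blk;

        VM_OBJECT_LOCK(object);
        blk = swp_pager_meta_ctl(object, pindex, 0);       /* look up only */
        if (blk != SWAPBLK_NONE) {
                /* remove the assignment and return the swap block to the bitmap */
                (void)swp_pager_meta_ctl(object, pindex, SWM_FREE);
        }
        /* SWM_POP would instead remove the assignment but keep the block. */
        VM_OBJECT_UNLOCK(object);
}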
 1987 
 1988 /*
 1989  * System call swapon(name) enables swapping on device name,
 1990  * which must be in the swdevsw.  Return EBUSY
 1991  * if already swapping on this device.
 1992  */
 1993 #ifndef _SYS_SYSPROTO_H_
 1994 struct swapon_args {
 1995         char *name;
 1996 };
 1997 #endif
 1998 
 1999 /* 
 2000  * MPSAFE
 2001  */
 2002 /* ARGSUSED */
 2003 int
 2004 swapon(struct thread *td, struct swapon_args *uap)
 2005 {
 2006         struct vattr attr;
 2007         struct vnode *vp;
 2008         struct nameidata nd;
 2009         int error;
 2010 
 2011         mtx_lock(&Giant);
 2012         error = suser(td);
 2013         if (error)
 2014                 goto done2;
 2015 
 2016         while (swdev_syscall_active)
 2017             tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
 2018         swdev_syscall_active = 1;
 2019 
 2020         /*
 2021          * Swap metadata may not fit in the KVM if we have physical
 2022          * memory of >1GB.
 2023          */
 2024         if (swap_zone == NULL) {
 2025                 error = ENOMEM;
 2026                 goto done;
 2027         }
 2028 
 2029         NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
 2030         error = namei(&nd);
 2031         if (error)
 2032                 goto done;
 2033 
 2034         NDFREE(&nd, NDF_ONLY_PNBUF);
 2035         vp = nd.ni_vp;
 2036 
 2037         if (vn_isdisk(vp, &error)) {
 2038                 error = swapongeom(td, vp);
 2039         } else if (vp->v_type == VREG &&
 2040             (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 2041             (error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) == 0) {
 2042                 /*
 2043                  * Allow direct swapping to NFS regular files in the same
 2044                  * way that nfs_mountroot() sets up diskless swapping.
 2045                  */
 2046                 error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
 2047         }
 2048 
 2049         if (error)
 2050                 vrele(vp);
 2051 done:
 2052         swdev_syscall_active = 0;
 2053         wakeup_one(&swdev_syscall_active);
 2054 done2:
 2055         mtx_unlock(&Giant);
 2056         return (error);
 2057 }
 2058 
 2059 static void
 2060 swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev)
 2061 {
 2062         struct swdevt *sp, *tsp;
 2063         swblk_t dvbase;
 2064         u_long mblocks;
 2065 
 2066         /*
 2067          * If we go beyond this, we get overflows in the radix
 2068          * tree bitmap code.
 2069          */
 2070         mblocks = 0x40000000 / BLIST_META_RADIX;
 2071         if (nblks > mblocks) {
 2072                 printf("WARNING: reducing size to maximum of %lu blocks per swap unit\n",
 2073                         mblocks);
 2074                 nblks = mblocks;
 2075         }
 2076         /*
 2077          * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
 2078          * First round nblks down to a page boundary, then convert.
 2079          * 
 2080          * sw->sw_nblks is in page-sized chunks now too.
 2081          */
 2082         nblks &= ~(ctodb(1) - 1);
 2083         nblks = dbtoc(nblks);
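        /*
         * Illustrative arithmetic (not part of the original file), assuming
         * PAGE_SIZE is 4096 and DEV_BSIZE is 512: ctodb(1) == 8, so
         * "nblks &= ~7" discards any partial trailing page, and
         * dbtoc(nblks) == nblks / 8 yields the count in pages.  For example,
         * 1048577 512-byte blocks -> 1048576 blocks -> 131072 pages.
         */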
 2084 
 2085         sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO);
 2086         sp->sw_vp = vp;
 2087         sp->sw_id = id;
 2088         sp->sw_dev = dev;
 2089         sp->sw_flags = 0;
 2090         sp->sw_nblks = nblks;
 2091         sp->sw_used = 0;
 2092         sp->sw_strategy = strategy;
 2093         sp->sw_close = close;
 2094 
 2095         sp->sw_blist = blist_create(nblks);
 2096         /*
 2097          * Do not free the first two blocks in order to avoid overwriting
 2098          * any BSD label at the front of the partition.
 2099          */
 2100         blist_free(sp->sw_blist, 2, nblks - 2);
 2101 
 2102         dvbase = 0;
 2103         mtx_lock(&sw_dev_mtx);
 2104         TAILQ_FOREACH(tsp, &swtailq, sw_list) {
 2105                 if (tsp->sw_end >= dvbase) {
 2106                         /*
 2107                          * We put one uncovered page between the devices
 2108                          * in order to definitively prevent any cross-device
 2109                          * I/O requests
 2110                          */
 2111                         dvbase = tsp->sw_end + 1;
 2112                 }
 2113         }
 2114         sp->sw_first = dvbase;
 2115         sp->sw_end = dvbase + nblks;
 2116         TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
 2117         nswapdev++;
 2118         swap_pager_avail += nblks;
 2119         swp_sizecheck();
 2120         mtx_unlock(&sw_dev_mtx);
 2121 }
 2122 
 2123 /*
 2124  * SYSCALL: swapoff(devname)
 2125  *
 2126  * Disable swapping on the given device.
 2127  *
 2128  * XXX: Badly designed system call: it should use a device index
 2129  * rather than filename as specification.  We keep sw_vp around
 2130  * only to make this work.
 2131  */
 2132 #ifndef _SYS_SYSPROTO_H_
 2133 struct swapoff_args {
 2134         char *name;
 2135 };
 2136 #endif
 2137 
 2138 /*
 2139  * MPSAFE
 2140  */
 2141 /* ARGSUSED */
 2142 int
 2143 swapoff(struct thread *td, struct swapoff_args *uap)
 2144 {
 2145         struct vnode *vp;
 2146         struct nameidata nd;
 2147         struct swdevt *sp;
 2148         u_long nblks, dvbase;
 2149         int error;
 2150 
 2151         mtx_lock(&Giant);
 2152 
 2153         error = suser(td);
 2154         if (error)
 2155                 goto done2;
 2156 
 2157         while (swdev_syscall_active)
 2158             tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
 2159         swdev_syscall_active = 1;
 2160 
 2161         NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
 2162         error = namei(&nd);
 2163         if (error)
 2164                 goto done;
 2165         NDFREE(&nd, NDF_ONLY_PNBUF);
 2166         vp = nd.ni_vp;
 2167 
 2168         mtx_lock(&sw_dev_mtx);
 2169         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2170                 if (sp->sw_vp == vp)
 2171                         goto found;
 2172         }
 2173         mtx_unlock(&sw_dev_mtx);
 2174         error = EINVAL;
 2175         goto done;
 2176 found:
 2177         mtx_unlock(&sw_dev_mtx);
 2178 #ifdef MAC
 2179         (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 2180         error = mac_check_system_swapoff(td->td_ucred, vp);
 2181         (void) VOP_UNLOCK(vp, 0, td);
 2182         if (error != 0)
 2183                 goto done;
 2184 #endif
 2185         
 2186         nblks = sp->sw_nblks;
 2187 
 2188         /*
 2189          * We can turn off this swap device safely only if the
 2190          * available virtual memory in the system will fit the amount
 2191          * of data we will have to page back in, plus an epsilon so
 2192          * the system doesn't become critically low on swap space.
 2193          */
 2194         if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail <
 2195             nblks + nswap_lowat) {
 2196                 error = ENOMEM;
 2197                 goto done;
 2198         }
 2199 
 2200         /*
 2201          * Prevent further allocations on this device.
 2202          */
 2203         mtx_lock(&sw_dev_mtx);
 2204         sp->sw_flags |= SW_CLOSING;
 2205         for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) {
 2206                 swap_pager_avail -= blist_fill(sp->sw_blist,
 2207                      dvbase, dmmax);
 2208         }
 2209         mtx_unlock(&sw_dev_mtx);
 2210 
 2211         /*
 2212          * Page in the contents of the device and close it.
 2213          */
 2214         swap_pager_swapoff(sp);
 2215 
 2216         sp->sw_close(td, sp);
 2217         sp->sw_id = NULL;
 2218         mtx_lock(&sw_dev_mtx);
 2219         TAILQ_REMOVE(&swtailq, sp, sw_list);
 2220         nswapdev--;
 2221         if (nswapdev == 0) {
 2222                 swap_pager_full = 2;
 2223                 swap_pager_almost_full = 1;
 2224         }
 2225         if (swdevhd == sp)
 2226                 swdevhd = NULL;
 2227         mtx_unlock(&sw_dev_mtx);
 2228         blist_destroy(sp->sw_blist);
 2229         free(sp, M_VMPGDATA);
 2230 
 2231 done:
 2232         swdev_syscall_active = 0;
 2233         wakeup_one(&swdev_syscall_active);
 2234 done2:
 2235         mtx_unlock(&Giant);
 2236         return (error);
 2237 }
 2238 
 2239 void
 2240 swap_pager_status(int *total, int *used)
 2241 {
 2242         struct swdevt *sp;
 2243 
 2244         *total = 0;
 2245         *used = 0;
 2246         mtx_lock(&sw_dev_mtx);
 2247         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2248                 *total += sp->sw_nblks;
 2249                 *used += sp->sw_used;
 2250         }
 2251         mtx_unlock(&sw_dev_mtx);
 2252 }
 2253 
 2254 static int
 2255 sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
 2256 {
 2257         int     *name = (int *)arg1;
 2258         int     error, n;
 2259         struct xswdev xs;
 2260         struct swdevt *sp;
 2261 
 2262         if (arg2 != 1) /* name length */
 2263                 return (EINVAL);
 2264 
 2265         n = 0;
 2266         mtx_lock(&sw_dev_mtx);
 2267         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2268                 if (n == *name) {
 2269                         mtx_unlock(&sw_dev_mtx);
 2270                         xs.xsw_version = XSWDEV_VERSION;
 2271                         xs.xsw_dev = sp->sw_dev;
 2272                         xs.xsw_flags = sp->sw_flags;
 2273                         xs.xsw_nblks = sp->sw_nblks;
 2274                         xs.xsw_used = sp->sw_used;
 2275 
 2276                         error = SYSCTL_OUT(req, &xs, sizeof(xs));
 2277                         return (error);
 2278                 }
 2279                 n++;
 2280         }
 2281         mtx_unlock(&sw_dev_mtx);
 2282         return (ENOENT);
 2283 }
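
/*
 * Illustrative userland sketch (not part of this file): how a consumer such
 * as swapinfo(8) might read the per-device records exported by the
 * vm.swap_info node declared below.  The header locations and the consumer
 * details are assumptions; error handling is minimal.  Iterating the unit
 * number from 0 until the sysctl fails with ENOENT enumerates all devices.
 */
#if 0   /* userland example, not kernel code */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <vm/vm_param.h>        /* struct xswdev (assumed location) */
#include <stdio.h>

static int
print_swap_device(int unit)
{
        int mib[CTL_MAXNAME];
        size_t miblen, len;
        struct xswdev xs;

        miblen = sizeof(mib) / sizeof(mib[0]);
        if (sysctlnametomib("vm.swap_info", mib, &miblen) != 0)
                return (-1);
        mib[miblen] = unit;             /* device index is the final name component */
        len = sizeof(xs);
        if (sysctl(mib, miblen + 1, &xs, &len, NULL, 0) != 0)
                return (-1);            /* ENOENT once unit runs past the last device */
        printf("device %d: %d blocks, %d used\n", unit, xs.xsw_nblks, xs.xsw_used);
        return (0);
}
#endif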
 2284 
 2285 SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
 2286     "Number of swap devices");
 2287 SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info,
 2288     "Swap statistics by device");
 2289 
 2290 /*
 2291  * vmspace_swap_count() - count the approximate swap usage in pages for a
 2292  *                        vmspace.
 2293  *
 2294  *      The map must be locked.
 2295  *
 2296  *      Swap usage is determined by taking the proportional swap used by
 2297  *      VM objects backing the VM map.  To make up for fractional losses,
 2298  *      if the VM object has any swap use at all the associated map entries
 2299  *      count for at least 1 swap page.
 2300  */
 2301 int
 2302 vmspace_swap_count(struct vmspace *vmspace)
 2303 {
 2304         vm_map_t map = &vmspace->vm_map;
 2305         vm_map_entry_t cur;
 2306         int count = 0;
 2307 
 2308         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 2309                 vm_object_t object;
 2310 
 2311                 if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
 2312                     (object = cur->object.vm_object) != NULL) {
 2313                         VM_OBJECT_LOCK(object);
 2314                         if (object->type == OBJT_SWAP &&
 2315                             object->un_pager.swp.swp_bcount != 0) {
 2316                                 int n = (cur->end - cur->start) / PAGE_SIZE;
 2317 
 2318                                 count += object->un_pager.swp.swp_bcount *
 2319                                     SWAP_META_PAGES * n / object->size + 1;
 2320                         }
 2321                         VM_OBJECT_UNLOCK(object);
 2322                 }
 2323         }
 2324         return (count);
 2325 }
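
/*
 * Worked example for the proportional estimate above (not part of the
 * original file), assuming SWAP_META_PAGES is 16: an object of 1024 pages
 * with swp_bcount == 4, mapped by an entry covering n == 256 pages,
 * contributes 4 * 16 * 256 / 1024 + 1 == 17 pages to the estimate, the
 * trailing "+ 1" guaranteeing that any swap use counts for at least one page.
 */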
 2326 
 2327 /*
 2328  * GEOM backend
 2329  *
 2330  * Swapping onto disk devices.
 2331  *
 2332  */
 2333 
 2334 static g_orphan_t swapgeom_orphan;
 2335 
 2336 static struct g_class g_swap_class = {
 2337         .name = "SWAP",
 2338         .version = G_VERSION,
 2339         .orphan = swapgeom_orphan,
 2340 };
 2341 
 2342 DECLARE_GEOM_CLASS(g_swap_class, g_class);
 2343 
 2344 
 2345 static void
 2346 swapgeom_done(struct bio *bp2)
 2347 {
 2348         struct buf *bp;
 2349 
 2350         bp = bp2->bio_caller2;
 2351         if (bp2->bio_error)
 2352                 bp->b_ioflags |= BIO_ERROR;
 2353         bufdone(bp);
 2354         g_destroy_bio(bp2);
 2355 }
 2356 
 2357 static void
 2358 swapgeom_strategy(struct buf *bp, struct swdevt *sp)
 2359 {
 2360         struct bio *bio;
 2361         struct g_consumer *cp;
 2362 
 2363         cp = sp->sw_id;
 2364         if (cp == NULL) {
 2365                 bp->b_error = ENXIO;
 2366                 bp->b_ioflags |= BIO_ERROR;
 2367                 bufdone(bp);
 2368                 return;
 2369         }
 2370         bio = g_clone_bio(&bp->b_io);
 2371         if (bio == NULL) {
 2372                 /*
 2373                  * XXX: This is better than panicking, but not much better.
 2374                  * XXX: Somehow this should be retried.  A more generic
 2375                  * XXX: implementation of ENOMEM in geom may be able to cope.
 2376                  */
 2377                 bp->b_error = ENOMEM;
 2378                 bp->b_ioflags |= BIO_ERROR;
 2379                 bufdone(bp);
 2380                 return;
 2381         }
 2382         bio->bio_caller2 = bp;
 2383         bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
 2384         bio->bio_length = bp->b_bcount;
 2385         bio->bio_done = swapgeom_done;
 2386         g_io_request(bio, cp);
 2387         return;
 2388 }
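
/*
 * Illustrative arithmetic for the bio_offset computation above (not part of
 * the original file): swap block numbers are page-sized units in the global
 * swap address space, so subtracting sw_first rebases the block onto this
 * device and multiplying by PAGE_SIZE (assumed 4096 here) gives the byte
 * offset.  E.g. b_blkno == 10 on a device with sw_first == 2 maps to offset
 * (10 - 2) * 4096 == 32768.
 */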
 2389 
 2390 static void
 2391 swapgeom_orphan(struct g_consumer *cp)
 2392 {
 2393         struct swdevt *sp;
 2394 
 2395         mtx_lock(&sw_dev_mtx);
 2396         TAILQ_FOREACH(sp, &swtailq, sw_list)
 2397                 if (sp->sw_id == cp)
 2398                         sp->sw_id = NULL;
 2399         mtx_unlock(&sw_dev_mtx);
 2400 }
 2401 
 2402 static void
 2403 swapgeom_close_ev(void *arg, int flags)
 2404 {
 2405         struct g_consumer *cp;
 2406 
 2407         cp = arg;
 2408         g_access(cp, -1, -1, 0);
 2409         g_detach(cp);
 2410         g_destroy_consumer(cp);
 2411 }
 2412 
 2413 static void
 2414 swapgeom_close(struct thread *td, struct swdevt *sw)
 2415 {
 2416 
 2417         /* XXX: direct call when Giant untangled */
 2418         g_waitfor_event(swapgeom_close_ev, sw->sw_id, M_WAITOK, NULL);
 2419 }
 2420 
 2421 
 2422 struct swh0h0 {
 2423         struct cdev *dev;
 2424         struct vnode *vp;
 2425         int     error;
 2426 };
 2427 
 2428 static void
 2429 swapongeom_ev(void *arg, int flags)
 2430 {
 2431         struct swh0h0 *swh;
 2432         struct g_provider *pp;
 2433         struct g_consumer *cp;
 2434         static struct g_geom *gp;
 2435         struct swdevt *sp;
 2436         u_long nblks;
 2437         int error;
 2438 
 2439         swh = arg;
 2440         swh->error = 0;
 2441         pp = g_dev_getprovider(swh->dev);
 2442         if (pp == NULL) {
 2443                 swh->error = ENODEV;
 2444                 return;
 2445         }
 2446         mtx_lock(&sw_dev_mtx);
 2447         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2448                 cp = sp->sw_id;
 2449                 if (cp != NULL && cp->provider == pp) {
 2450                         mtx_unlock(&sw_dev_mtx);
 2451                         swh->error = EBUSY;
 2452                         return;
 2453                 }
 2454         }
 2455         mtx_unlock(&sw_dev_mtx);
 2456         if (gp == NULL)
 2457                 gp = g_new_geomf(&g_swap_class, "swap", NULL);
 2458         cp = g_new_consumer(gp);
 2459         g_attach(cp, pp);
 2460         /*
 2461          * XXX: Every time you think you can improve the margin for
 2462          * footshooting, somebody depends on the ability to do so:
 2463          * savecore(8) wants to write to our swapdev so we cannot
 2464          * set an exclusive count :-(
 2465          */
 2466         error = g_access(cp, 1, 1, 0);
 2467         if (error) {
 2468                 g_detach(cp);
 2469                 g_destroy_consumer(cp);
 2470                 swh->error = error;
 2471                 return;
 2472         }
 2473         nblks = pp->mediasize / DEV_BSIZE;
 2474         swaponsomething(swh->vp, cp, nblks, swapgeom_strategy,
 2475             swapgeom_close, dev2udev(swh->dev));
 2476         swh->error = 0;
 2477         return;
 2478 }
 2479 
 2480 static int
 2481 swapongeom(struct thread *td, struct vnode *vp)
 2482 {
 2483         int error;
 2484         struct swh0h0 swh;
 2485 
 2486         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 2487 
 2488         swh.dev = vp->v_rdev;
 2489         swh.vp = vp;
 2490         swh.error = 0;
 2491         /* XXX: direct call when Giant untangled */
 2492         error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL);
 2493         if (!error)
 2494                 error = swh.error;
 2495         VOP_UNLOCK(vp, 0, td);
 2496         return (error);
 2497 }
 2498 
 2499 /*
 2500  * VNODE backend
 2501  *
 2502  * This is used mainly for network filesystem (read: probably only tested
 2503  * with NFS) swapfiles.
 2504  *
 2505  */
 2506 
 2507 static void
 2508 swapdev_strategy(struct buf *bp, struct swdevt *sp)
 2509 {
 2510         int s;
 2511         struct vnode *vp, *vp2;
 2512 
 2513         bp->b_dev = NULL;
 2514         bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first);
 2515 
 2516         vp2 = sp->sw_id;
 2517         vhold(vp2);
 2518         s = splvm();
 2519         if (bp->b_iocmd == BIO_WRITE) {
 2520                 vp = bp->b_vp;
 2521                 if (vp) {
 2522                         VI_LOCK(vp);
 2523                         vp->v_numoutput--;
 2524                         if ((vp->v_iflag & VI_BWAIT) && vp->v_numoutput <= 0) {
 2525                                 vp->v_iflag &= ~VI_BWAIT;
 2526                                 wakeup(&vp->v_numoutput);
 2527                         }
 2528                         VI_UNLOCK(vp);
 2529                 }
 2530                 VI_LOCK(vp2);
 2531                 vp2->v_numoutput++;
 2532                 VI_UNLOCK(vp2);
 2533         }
 2534         bp->b_vp = vp2;
 2535         splx(s);
 2536         bp->b_iooffset = dbtob(bp->b_blkno);
 2537         VOP_STRATEGY(vp2, bp);
 2538         return;
 2539 }
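
/*
 * Illustrative arithmetic for the conversions above (not part of the
 * original file), assuming PAGE_SIZE 4096 and DEV_BSIZE 512: ctodb()
 * turns the page-sized, device-relative block number into 512-byte blocks
 * (a factor of 8), and dbtob() then turns that into the byte offset handed
 * to VOP_STRATEGY().  E.g. page block 3 -> disk block 24 -> byte offset 12288.
 */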
 2540 
 2541 static void
 2542 swapdev_close(struct thread *td, struct swdevt *sp)
 2543 {
 2544 
 2545         VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td);
 2546         vrele(sp->sw_vp);
 2547 }
 2548 
 2549 
 2550 static int
 2551 swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
 2552 {
 2553         struct swdevt *sp;
 2554         int error;
 2555 
 2556         if (nblks == 0)
 2557                 return (ENXIO);
 2558         mtx_lock(&sw_dev_mtx);
 2559         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2560                 if (sp->sw_id == vp) {
 2561                         mtx_unlock(&sw_dev_mtx);
 2562                         return (EBUSY);
 2563                 }
 2564         }
 2565         mtx_unlock(&sw_dev_mtx);
 2566     
 2567         (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 2568 #ifdef MAC
 2569         error = mac_check_system_swapon(td->td_ucred, vp);
 2570         if (error == 0)
 2571 #endif
 2572                 error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, -1);
 2573         (void) VOP_UNLOCK(vp, 0, td);
 2574         if (error)
 2575                 return (error);
 2576 
 2577         swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
 2578             NODEV);
 2579         return (0);
 2580 }
