FreeBSD/Linux Kernel Cross Reference
sys/vm/swap_pager.c


    1 /*
    2  * Copyright (c) 1998 Matthew Dillon,
    3  * Copyright (c) 1994 John S. Dyson
    4  * Copyright (c) 1990 University of Utah.
    5  * Copyright (c) 1982, 1986, 1989, 1993
    6  *      The Regents of the University of California.  All rights reserved.
    7  *
    8  * This code is derived from software contributed to Berkeley by
    9  * the Systems Programming Group of the University of Utah Computer
   10  * Science Department.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. All advertising materials mentioning features or use of this software
   21  *    must display the following acknowledgement:
   22  *      This product includes software developed by the University of
   23  *      California, Berkeley and its contributors.
   24  * 4. Neither the name of the University nor the names of its contributors
   25  *    may be used to endorse or promote products derived from this software
   26  *    without specific prior written permission.
   27  *
   28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   38  * SUCH DAMAGE.
   39  *
   40  *                              New Swap System
   41  *                              Matthew Dillon
   42  *
   43  * Radix Bitmap 'blists'.
   44  *
   45  *      - The new swapper uses the new radix bitmap code.  This should scale
   46  *        to arbitrarily small or arbitrarily large swap spaces and an almost
   47  *        arbitrary degree of fragmentation.
   48  *
   49  * Features:
   50  *
   51  *      - on the fly reallocation of swap during putpages.  The new system
   52  *        does not try to keep previously allocated swap blocks for dirty
   53  *        pages.  
   54  *
   55  *      - on the fly deallocation of swap
   56  *
   57  *      - No more garbage collection required.  Unnecessarily allocated swap
   58  *        blocks only exist for dirty vm_page_t's now and these are already
   59  *        cycled (in a high-load system) by the pager.  We also do on-the-fly
   60  *        removal of invalidated swap blocks when a page is destroyed
   61  *        or renamed.
   62  *
   63  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
   64  *
   65  *      @(#)swap_pager.c        8.9 (Berkeley) 3/21/94
   66  *      @(#)vm_swap.c   8.5 (Berkeley) 2/17/94
   67  */
   68 
   69 #include <sys/cdefs.h>
   70 __FBSDID("$FreeBSD: releng/5.3/sys/vm/swap_pager.c 135840 2004-09-27 04:53:18Z das $");
   71 
   72 #include "opt_mac.h"
   73 #include "opt_swap.h"
   74 #include "opt_vm.h"
   75 
   76 #include <sys/param.h>
   77 #include <sys/systm.h>
   78 #include <sys/conf.h>
   79 #include <sys/kernel.h>
   80 #include <sys/proc.h>
   81 #include <sys/bio.h>
   82 #include <sys/buf.h>
   83 #include <sys/disk.h>
   84 #include <sys/fcntl.h>
   85 #include <sys/mount.h>
   86 #include <sys/namei.h>
   87 #include <sys/vnode.h>
   88 #include <sys/mac.h>
   89 #include <sys/malloc.h>
   90 #include <sys/sysctl.h>
   91 #include <sys/sysproto.h>
   92 #include <sys/blist.h>
   93 #include <sys/lock.h>
   94 #include <sys/sx.h>
   95 #include <sys/vmmeter.h>
   96 
   97 #include <vm/vm.h>
   98 #include <vm/pmap.h>
   99 #include <vm/vm_map.h>
  100 #include <vm/vm_kern.h>
  101 #include <vm/vm_object.h>
  102 #include <vm/vm_page.h>
  103 #include <vm/vm_pager.h>
  104 #include <vm/vm_pageout.h>
  105 #include <vm/vm_param.h>
  106 #include <vm/swap_pager.h>
  107 #include <vm/vm_extern.h>
  108 #include <vm/uma.h>
  109 
  110 #include <geom/geom.h>
  111 
  112 /*
  113  * SWB_NPAGES must be a power of 2.  It may be set to 1, 2, 4, 8, or 16
  114  * pages per allocation.  We recommend you stick with the default of 8.
  115  * The 16-page limit is due to the radix code (kern/subr_blist.c).
  116  */
  117 #ifndef MAX_PAGEOUT_CLUSTER
  118 #define MAX_PAGEOUT_CLUSTER 16
  119 #endif
  120 
  121 #if !defined(SWB_NPAGES)
  122 #define SWB_NPAGES      MAX_PAGEOUT_CLUSTER
  123 #endif
  124 
  125 /*
  126  * Piecemeal swap metadata structure.  Swap is stored in a radix tree.
  127  *
  128  * If SWB_NPAGES is 8 and sizeof(char *) == sizeof(daddr_t), our radix
  129  * is basically 8.  Assuming PAGE_SIZE == 4096, one tree level represents
  130  * 32K worth of data, two levels represent 256K, three levels represent
  131  * 2 MBytes.   This is acceptable.
  132  *
  133  * Overall memory utilization is about the same as the old swap structure.
  134  */
  135 #define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t))
  136 #define SWAP_META_PAGES         (SWB_NPAGES * 2)
  137 #define SWAP_META_MASK          (SWAP_META_PAGES - 1)
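
       /*
        * Editor's note (a worked example, assuming no SWB_NPAGES override in
        * opt_swap.h): SWB_NPAGES defaults to MAX_PAGEOUT_CLUSTER (16), so
        * SWAP_META_PAGES is 32 and SWAP_META_MASK is 31.  Each struct swblock
        * below therefore tracks the swap assignments for 32 consecutive page
        * indices, i.e. 128K of object data with 4K pages.  Note that this
        * differs from the "default of 8" mentioned in the SWB_NPAGES comment
        * above, which appears to predate the current default.
        */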
  138 
  139 typedef int32_t swblk_t;        /*
  140                                  * swap offset.  This is the type used to
  141                                  * address the "virtual swap device" and
  142                                  * therefore the maximum swap space is
  143                                  * 2^32 pages.
  144                                  */
  145 
  146 struct swdevt;
  147 typedef void sw_strategy_t(struct buf *bp, struct swdevt *sw);
  148 typedef void sw_close_t(struct thread *td, struct swdevt *sw);
  149 
  150 /*
  151  * Swap device table
  152  */
  153 struct swdevt {
  154         int     sw_flags;
  155         int     sw_nblks;
  156         int     sw_used;
  157         dev_t   sw_dev;
  158         struct vnode *sw_vp;
  159         void    *sw_id;
  160         swblk_t sw_first;
  161         swblk_t sw_end;
  162         struct blist *sw_blist;
  163         TAILQ_ENTRY(swdevt)     sw_list;
  164         sw_strategy_t           *sw_strategy;
  165         sw_close_t              *sw_close;
  166 };
  167 
  168 #define SW_CLOSING      0x04
  169 
  170 struct swblock {
  171         struct swblock  *swb_hnext;
  172         vm_object_t     swb_object;
  173         vm_pindex_t     swb_index;
  174         int             swb_count;
  175         daddr_t         swb_pages[SWAP_META_PAGES];
  176 };
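
       /*
        * Editor's note: swb_count is the number of swb_pages[] slots that
        * currently hold a real swap block (not SWAPBLK_NONE); the metadata
        * routines free the whole swblock once it drops to zero.
        */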
  177 
  178 static struct mtx sw_dev_mtx;
  179 static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq);
  180 static struct swdevt *swdevhd;  /* Allocate from here next */
  181 static int nswapdev;            /* Number of swap devices */
  182 int swap_pager_avail;
  183 static int swdev_syscall_active = 0; /* serialize swap(on|off) */
  184 
  185 static void swapdev_strategy(struct buf *, struct swdevt *sw);
  186 
  187 #define SWM_FREE        0x02    /* free, period                 */
  188 #define SWM_POP         0x04    /* pop out                      */
  189 
  190 int swap_pager_full = 2;        /* swap space exhaustion (task killing) */
  191 static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
  192 static int nsw_rcount;          /* free read buffers                    */
  193 static int nsw_wcount_sync;     /* limit write buffers / synchronous    */
  194 static int nsw_wcount_async;    /* limit write buffers / asynchronous   */
  195 static int nsw_wcount_async_max;/* assigned maximum                     */
  196 static int nsw_cluster_max;     /* maximum VOP I/O allowed              */
  197 
  198 static struct swblock **swhash;
  199 static int swhash_mask;
  200 static struct mtx swhash_mtx;
  201 
  202 static int swap_async_max = 4;  /* maximum in-progress async I/O's      */
  203 static struct sx sw_alloc_sx;
  204 
  205 
  206 SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
  207         CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
  208 
  209 /*
  210  * "named" and "unnamed" anon region objects.  Try to reduce the overhead
  211  * of searching a named list by hashing it just a little.
  212  */
  213 
  214 #define NOBJLISTS               8
  215 
  216 #define NOBJLIST(handle)        \
  217         (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
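
       /*
        * Editor's note: NOBJLIST() discards the low four (alignment) bits of
        * the opaque handle pointer and masks the result into one of the
        * NOBJLISTS (8) pager lists, so handles differing only in those low
        * bits fall on the same list.
        */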
  218 
  219 static struct mtx sw_alloc_mtx; /* protect list manipulation */ 
  220 static struct pagerlst  swap_pager_object_list[NOBJLISTS];
  221 static uma_zone_t       swap_zone;
  222 static struct vm_object swap_zone_obj;
  223 
  224 /*
  225  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
  226  * calls hooked from other parts of the VM system and do not appear here.
  227  * (see vm/swap_pager.h).
  228  */
  229 static vm_object_t
  230                 swap_pager_alloc(void *handle, vm_ooffset_t size,
  231                                       vm_prot_t prot, vm_ooffset_t offset);
  232 static void     swap_pager_dealloc(vm_object_t object);
  233 static int      swap_pager_getpages(vm_object_t, vm_page_t *, int, int);
  234 static void     swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
  235 static boolean_t
  236                 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
  237 static void     swap_pager_init(void);
  238 static void     swap_pager_unswapped(vm_page_t);
  239 static void     swap_pager_swapoff(struct swdevt *sp, int *sw_used);
  240 
  241 struct pagerops swappagerops = {
  242         .pgo_init =     swap_pager_init,        /* early system initialization of pager */
  243         .pgo_alloc =    swap_pager_alloc,       /* allocate an OBJT_SWAP object         */
  244         .pgo_dealloc =  swap_pager_dealloc,     /* deallocate an OBJT_SWAP object       */
  245         .pgo_getpages = swap_pager_getpages,    /* pagein                               */
  246         .pgo_putpages = swap_pager_putpages,    /* pageout                              */
  247         .pgo_haspage =  swap_pager_haspage,     /* get backing store status for page    */
  248         .pgo_pageunswapped = swap_pager_unswapped,      /* remove swap related to page          */
  249 };
  250 
  251 /*
  252  * dmmax is in page-sized chunks with the new swap system.  It was
  253  * dev-bsized chunks in the old.  dmmax is always a power of 2.
  254  *
  255  * swap_*() routines are externally accessible.  swp_*() routines are
  256  * internal.
  257  */
  258 static int dmmax;
  259 static int nswap_lowat = 128;   /* in pages, swap_pager_almost_full warn */
  260 static int nswap_hiwat = 512;   /* in pages, swap_pager_almost_full warn */
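
       /*
        * Editor's note (assuming PAGE_SIZE == 4096): with the defaults above,
        * swp_sizecheck() warns once fewer than 128 pages (512K) of swap
        * remain and clears the almost-full indication again only after more
        * than 512 pages (2M) are free.
        */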
  261 
  262 SYSCTL_INT(_vm, OID_AUTO, dmmax,
  263         CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block");
  264 
  265 static void     swp_sizecheck(void);
  266 static void     swp_pager_async_iodone(struct buf *bp);
  267 static int      swapongeom(struct thread *, struct vnode *);
  268 static int      swaponvp(struct thread *, struct vnode *, u_long);
  269 
  270 /*
  271  * Swap bitmap functions
  272  */
  273 static void     swp_pager_freeswapspace(daddr_t blk, int npages);
  274 static daddr_t  swp_pager_getswapspace(int npages);
  275 
  276 /*
  277  * Metadata functions
  278  */
  279 static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index);
  280 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
  281 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
  282 static void swp_pager_meta_free_all(vm_object_t);
  283 static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
  284 
  285 /*
  286  * SWP_SIZECHECK() -    update swap_pager_full indication
  287  *      
  288  *      update the swap_pager_almost_full indication and warn when we are
  289  *      about to run out of swap space, using lowat/hiwat hysteresis.
  290  *
  291  *      Clear swap_pager_full ( task killing ) indication when lowat is met.
  292  *
  293  *      No restrictions on call
  294  *      This routine may not block.
  295  *      This routine must be called at splvm()
  296  */
  297 static void
  298 swp_sizecheck(void)
  299 {
  300 
  301         if (swap_pager_avail < nswap_lowat) {
  302                 if (swap_pager_almost_full == 0) {
  303                         printf("swap_pager: out of swap space\n");
  304                         swap_pager_almost_full = 1;
  305                 }
  306         } else {
  307                 swap_pager_full = 0;
  308                 if (swap_pager_avail > nswap_hiwat)
  309                         swap_pager_almost_full = 0;
  310         }
  311 }
  312 
  313 /*
  314  * SWP_PAGER_HASH() -   hash swap meta data
  315  *
   316  *      This is a helper function which hashes the swapblk given
  317  *      the object and page index.  It returns a pointer to a pointer
   318  *      to the matching swblock, or a pointer to a NULL pointer if it
   319  *      could not find a swapblk.
  320  *
  321  *      This routine must be called at splvm().
  322  */
  323 static struct swblock **
  324 swp_pager_hash(vm_object_t object, vm_pindex_t index)
  325 {
  326         struct swblock **pswap;
  327         struct swblock *swap;
  328 
  329         index &= ~(vm_pindex_t)SWAP_META_MASK;
  330         pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask];
  331         while ((swap = *pswap) != NULL) {
  332                 if (swap->swb_object == object &&
  333                     swap->swb_index == index
  334                 ) {
  335                         break;
  336                 }
  337                 pswap = &swap->swb_hnext;
  338         }
  339         return (pswap);
  340 }
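
       /*
        * Editor's note: a rough sketch of the caller pattern (this is how the
        * swp_pager_meta_*() routines declared above use it):
        *
        *      pswap = swp_pager_hash(object, pindex);
        *      if ((swap = *pswap) != NULL)
        *              blk = swap->swb_pages[pindex & SWAP_META_MASK];
        *      else
        *              ... no swblock covers this range; one may be allocated
        *              ... from swap_zone and linked in through *pswap
        *
        * The returned slot is either the hash-chain link that points at the
        * matching swblock, or the NULL tail pointer where a new entry belongs.
        */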
  341 
  342 /*
  343  * SWAP_PAGER_INIT() -  initialize the swap pager!
  344  *
  345  *      Expected to be started from system init.  NOTE:  This code is run 
  346  *      before much else so be careful what you depend on.  Most of the VM
  347  *      system has yet to be initialized at this point.
  348  */
  349 static void
  350 swap_pager_init(void)
  351 {
  352         /*
  353          * Initialize object lists
  354          */
  355         int i;
  356 
  357         for (i = 0; i < NOBJLISTS; ++i)
  358                 TAILQ_INIT(&swap_pager_object_list[i]);
  359         mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF);
  360         mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
  361 
  362         /*
  363          * Device Stripe, in PAGE_SIZE'd blocks
  364          */
  365         dmmax = SWB_NPAGES * 2;
  366 }
  367 
  368 /*
  369  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
  370  *
  371  *      Expected to be started from pageout process once, prior to entering
  372  *      its main loop.
  373  */
  374 void
  375 swap_pager_swap_init(void)
  376 {
  377         int n, n2;
  378 
  379         /*
  380          * Number of in-transit swap bp operations.  Don't
  381          * exhaust the pbufs completely.  Make sure we
  382          * initialize workable values (0 will work for hysteresis
  383          * but it isn't very efficient).
  384          *
  385          * The nsw_cluster_max is constrained by the bp->b_pages[]
  386          * array (MAXPHYS/PAGE_SIZE) and our locally defined
  387          * MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
  388          * constrained by the swap device interleave stripe size.
  389          *
  390          * Currently we hardwire nsw_wcount_async to 4.  This limit is 
  391          * designed to prevent other I/O from having high latencies due to
  392          * our pageout I/O.  The value 4 works well for one or two active swap
  393          * devices but is probably a little low if you have more.  Even so,
  394          * a higher value would probably generate only a limited improvement
  395          * with three or four active swap devices since the system does not
  396          * typically have to pageout at extreme bandwidths.   We will want
  397          * at least 2 per swap devices, and 4 is a pretty good value if you
  398          * have one NFS swap device due to the command/ack latency over NFS.
  399          * So it all works out pretty well.
  400          */
  401         nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
  402 
  403         mtx_lock(&pbuf_mtx);
  404         nsw_rcount = (nswbuf + 1) / 2;
  405         nsw_wcount_sync = (nswbuf + 3) / 4;
  406         nsw_wcount_async = 4;
  407         nsw_wcount_async_max = nsw_wcount_async;
  408         mtx_unlock(&pbuf_mtx);
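
       /*
        * Editor's note (illustrative only): with a hypothetical nswbuf of
        * 256, the settings above come out to nsw_rcount = 128 read pbufs,
        * nsw_wcount_sync = 64 synchronous write pbufs and nsw_wcount_async
        * = 4 asynchronous write pbufs.
        */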
  409 
  410         /*
  411          * Initialize our zone.  Right now I'm just guessing on the number
  412          * we need based on the number of pages in the system.  Each swblock
  413          * can hold 16 pages, so this is probably overkill.  This reservation
  414          * is typically limited to around 32MB by default.
  415          */
  416         n = cnt.v_page_count / 2;
  417         if (maxswzone && n > maxswzone / sizeof(struct swblock))
  418                 n = maxswzone / sizeof(struct swblock);
  419         n2 = n;
  420         swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL,
  421             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
  422         do {
  423                 if (uma_zone_set_obj(swap_zone, &swap_zone_obj, n))
  424                         break;
  425                 /*
  426                  * if the allocation failed, try a zone two thirds the
  427                  * size of the previous attempt.
  428                  */
  429                 n -= ((n + 2) / 3);
  430         } while (n > 0);
  431         if (swap_zone == NULL)
  432                 panic("failed to create swap_zone.");
  433         if (n2 != n)
  434                 printf("Swap zone entries reduced from %d to %d.\n", n2, n);
  435         n2 = n;
  436 
  437         /*
  438          * Initialize our meta-data hash table.  The swapper does not need to
  439          * be quite as efficient as the VM system, so we do not use an 
  440          * oversized hash table.
  441          *
  442          *      n:              size of hash table, must be power of 2
  443          *      swhash_mask:    hash table index mask
  444          */
  445         for (n = 1; n < n2 / 8; n *= 2)
  446                 ;
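       /*
        * Editor's note: the loop above picks the smallest power of two that
        * is >= n2 / 8; e.g. a zone sized for 65536 swblock entries gets an
        * 8192-bucket table, or roughly eight entries per bucket when full.
        */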
  447         swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO);
  448         swhash_mask = n - 1;
  449         mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF);
  450 }
  451 
  452 /*
  453  * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
  454  *                      its metadata structures.
  455  *
  456  *      This routine is called from the mmap and fork code to create a new
  457  *      OBJT_SWAP object.  We do this by creating an OBJT_DEFAULT object
  458  *      and then converting it with swp_pager_meta_build().
  459  *
  460  *      This routine may block in vm_object_allocate() and create a named
  461  *      object lookup race, so we must interlock.   We must also run at
  462  *      splvm() for the object lookup to handle races with interrupts, but
  463  *      we do not have to maintain splvm() in between the lookup and the
  464  *      add because (I believe) it is not possible to attempt to create
  465  *      a new swap object w/handle when a default object with that handle
  466  *      already exists.
  467  *
  468  * MPSAFE
  469  */
  470 static vm_object_t
  471 swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
  472                  vm_ooffset_t offset)
  473 {
  474         vm_object_t object;
  475         vm_pindex_t pindex;
  476 
  477         pindex = OFF_TO_IDX(offset + PAGE_MASK + size);
  478 
  479         if (handle) {
  480                 mtx_lock(&Giant);
  481                 /*
  482                  * Reference existing named region or allocate new one.  There
  483                  * should not be a race here against swp_pager_meta_build()
  484                  * as called from vm_page_remove() in regards to the lookup
  485                  * of the handle.
  486                  */
  487                 sx_xlock(&sw_alloc_sx);
  488                 object = vm_pager_object_lookup(NOBJLIST(handle), handle);
  489 
  490                 if (object != NULL) {
  491                         vm_object_reference(object);
  492                 } else {
  493                         object = vm_object_allocate(OBJT_DEFAULT, pindex);
  494                         object->handle = handle;
  495 
  496                         VM_OBJECT_LOCK(object);
  497                         swp_pager_meta_build(object, 0, SWAPBLK_NONE);
  498                         VM_OBJECT_UNLOCK(object);
  499                 }
  500                 sx_xunlock(&sw_alloc_sx);
  501                 mtx_unlock(&Giant);
  502         } else {
  503                 object = vm_object_allocate(OBJT_DEFAULT, pindex);
  504 
  505                 VM_OBJECT_LOCK(object);
  506                 swp_pager_meta_build(object, 0, SWAPBLK_NONE);
  507                 VM_OBJECT_UNLOCK(object);
  508         }
  509         return (object);
  510 }
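
       /*
        * Editor's note: this is normally reached through the pagerops table
        * rather than called directly; a hypothetical caller would look
        * roughly like:
        *
        *      object = vm_pager_allocate(OBJT_SWAP, NULL,
        *          IDX_TO_OFF(npages), VM_PROT_DEFAULT, 0);
        *
        * A NULL handle takes the unnamed branch above and skips the
        * sw_alloc_sx interlock entirely.
        */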
  511 
  512 /*
  513  * SWAP_PAGER_DEALLOC() -       remove swap metadata from object
  514  *
  515  *      The swap backing for the object is destroyed.  The code is 
  516  *      designed such that we can reinstantiate it later, but this
  517  *      routine is typically called only when the entire object is
  518  *      about to be destroyed.
  519  *
   520  *      This routine formerly could block, but no longer does.
  521  *
  522  *      The object must be locked or unreferenceable.
  523  */
  524 static void
  525 swap_pager_dealloc(vm_object_t object)
  526 {
  527         int s;
  528 
  529         /*
  530          * Remove from list right away so lookups will fail if we block for
  531          * pageout completion.
  532          */
  533         if (object->handle != NULL) {
  534                 mtx_lock(&sw_alloc_mtx);
  535                 TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list);
  536                 mtx_unlock(&sw_alloc_mtx);
  537         }
  538 
  539         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
  540         vm_object_pip_wait(object, "swpdea");
  541 
  542         /*
  543          * Free all remaining metadata.  We only bother to free it from 
  544          * the swap meta data.  We do not attempt to free swapblk's still
  545          * associated with vm_page_t's for this object.  We do not care
  546          * if paging is still in progress on some objects.
  547          */
  548         s = splvm();
  549         swp_pager_meta_free_all(object);
  550         splx(s);
  551 }
  552 
  553 /************************************************************************
  554  *                      SWAP PAGER BITMAP ROUTINES                      *
  555  ************************************************************************/
  556 
  557 /*
  558  * SWP_PAGER_GETSWAPSPACE() -   allocate raw swap space
  559  *
  560  *      Allocate swap for the requested number of pages.  The starting
  561  *      swap block number (a page index) is returned or SWAPBLK_NONE
  562  *      if the allocation failed.
  563  *
  564  *      Also has the side effect of advising that somebody made a mistake
  565  *      when they configured swap and didn't configure enough.
  566  *
  567  *      Must be called at splvm() to avoid races with bitmap frees from
  568  *      vm_page_remove() aka swap_pager_page_removed().
  569  *
  570  *      This routine may not block
  571  *      This routine must be called at splvm().
  572  *
  573  *      We allocate in round-robin fashion from the configured devices.
  574  */
  575 static daddr_t
  576 swp_pager_getswapspace(int npages)
  577 {
  578         daddr_t blk;
  579         struct swdevt *sp;
  580         int i;
  581 
  582         blk = SWAPBLK_NONE;
  583         mtx_lock(&sw_dev_mtx);
  584         sp = swdevhd;
  585         for (i = 0; i < nswapdev; i++) {
  586                 if (sp == NULL)
  587                         sp = TAILQ_FIRST(&swtailq);
  588                 if (!(sp->sw_flags & SW_CLOSING)) {
  589                         blk = blist_alloc(sp->sw_blist, npages);
  590                         if (blk != SWAPBLK_NONE) {
  591                                 blk += sp->sw_first;
  592                                 sp->sw_used += npages;
  593                                 swap_pager_avail -= npages;
  594                                 swp_sizecheck();
  595                                 swdevhd = TAILQ_NEXT(sp, sw_list);
  596                                 goto done;
  597                         }
  598                 }
  599                 sp = TAILQ_NEXT(sp, sw_list);
  600         }
  601         if (swap_pager_full != 2) {
  602                 printf("swap_pager_getswapspace(%d): failed\n", npages);
  603                 swap_pager_full = 2;
  604                 swap_pager_almost_full = 1;
  605         }
  606         swdevhd = NULL;
  607 done:
  608         mtx_unlock(&sw_dev_mtx);
  609         return (blk);
  610 }
  611 
  612 static struct swdevt *
  613 swp_pager_find_dev(daddr_t blk)
  614 {
  615         struct swdevt *sp;
  616 
  617         mtx_lock(&sw_dev_mtx);
  618         TAILQ_FOREACH(sp, &swtailq, sw_list) {
  619                 if (blk >= sp->sw_first && blk < sp->sw_end) {
  620                         mtx_unlock(&sw_dev_mtx);
  621                         return (sp);
  622                 }
  623         }
  624         panic("Swapdev not found");
  625 }
  626         
  627 static void
  628 swp_pager_strategy(struct buf *bp)
  629 {
  630         struct swdevt *sp;
  631 
  632         mtx_lock(&sw_dev_mtx);
  633         TAILQ_FOREACH(sp, &swtailq, sw_list) {
  634                 if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) {
  635                         mtx_unlock(&sw_dev_mtx);
  636                         sp->sw_strategy(bp, sp);
  637                         return;
  638                 }
  639         }
  640         panic("Swapdev not found");
  641 }
  642         
  643 
  644 /*
  645  * SWP_PAGER_FREESWAPSPACE() -  free raw swap space 
  646  *
  647  *      This routine returns the specified swap blocks back to the bitmap.
  648  *
  649  *      Note:  This routine may not block (it could in the old swap code),
  650  *      and through the use of the new blist routines it does not block.
  651  *
  652  *      We must be called at splvm() to avoid races with bitmap frees from
  653  *      vm_page_remove() aka swap_pager_page_removed().
  654  *
  655  *      This routine may not block
  656  *      This routine must be called at splvm().
  657  */
  658 static void
  659 swp_pager_freeswapspace(daddr_t blk, int npages)
  660 {
  661         struct swdevt *sp;
  662 
  663         mtx_lock(&sw_dev_mtx);
  664         TAILQ_FOREACH(sp, &swtailq, sw_list) {
  665                 if (blk >= sp->sw_first && blk < sp->sw_end) {
  666                         sp->sw_used -= npages;
  667                         /*
  668                          * If we are attempting to stop swapping on
  669                          * this device, we don't want to mark any
  670                          * blocks free lest they be reused.  
  671                          */
  672                         if ((sp->sw_flags & SW_CLOSING) == 0) {
  673                                 blist_free(sp->sw_blist, blk - sp->sw_first,
  674                                     npages);
  675                                 swap_pager_avail += npages;
  676                                 swp_sizecheck();
  677                         }
  678                         mtx_unlock(&sw_dev_mtx);
  679                         return;
  680                 }
  681         }
  682         panic("Swapdev not found");
  683 }
  684 
  685 /*
  686  * SWAP_PAGER_FREESPACE() -     frees swap blocks associated with a page
  687  *                              range within an object.
  688  *
  689  *      This is a globally accessible routine.
  690  *
  691  *      This routine removes swapblk assignments from swap metadata.
  692  *
  693  *      The external callers of this routine typically have already destroyed 
  694  *      or renamed vm_page_t's associated with this range in the object so 
  695  *      we should be ok.
  696  *
  697  *      This routine may be called at any spl.  We up our spl to splvm temporarily
  698  *      in order to perform the metadata removal.
  699  */
  700 void
  701 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size)
  702 {
  703         int s = splvm();
  704 
  705         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
  706         swp_pager_meta_free(object, start, size);
  707         splx(s);
  708 }
  709 
  710 /*
  711  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
  712  *
  713  *      Assigns swap blocks to the specified range within the object.  The 
   714  *      swap blocks are not zeroed.  Any previous swap assignment is destroyed.
  715  *
  716  *      Returns 0 on success, -1 on failure.
  717  */
  718 int
  719 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
  720 {
  721         int s;
  722         int n = 0;
  723         daddr_t blk = SWAPBLK_NONE;
  724         vm_pindex_t beg = start;        /* save start index */
  725 
  726         s = splvm();
  727         VM_OBJECT_LOCK(object);
  728         while (size) {
  729                 if (n == 0) {
  730                         n = BLIST_MAX_ALLOC;
  731                         while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) {
  732                                 n >>= 1;
  733                                 if (n == 0) {
  734                                         swp_pager_meta_free(object, beg, start - beg);
  735                                         VM_OBJECT_UNLOCK(object);
  736                                         splx(s);
  737                                         return (-1);
  738                                 }
  739                         }
  740                 }
  741                 swp_pager_meta_build(object, start, blk);
  742                 --size;
  743                 ++start;
  744                 ++blk;
  745                 --n;
  746         }
  747         swp_pager_meta_free(object, start, n);
  748         VM_OBJECT_UNLOCK(object);
  749         splx(s);
  750         return (0);
  751 }
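
       /*
        * Editor's note: swap_pager_reserve() is used, for example, by
        * swap-backed md(4) devices configured with the "reserve" option to
        * preallocate their entire backing store up front.
        */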
  752 
  753 /*
  754  * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
  755  *                      and destroy the source.
  756  *
  757  *      Copy any valid swapblks from the source to the destination.  In
  758  *      cases where both the source and destination have a valid swapblk,
  759  *      we keep the destination's.
  760  *
  761  *      This routine is allowed to block.  It may block allocating metadata
  762  *      indirectly through swp_pager_meta_build() or if paging is still in
  763  *      progress on the source. 
  764  *
  765  *      This routine can be called at any spl
  766  *
   767  *      XXX vm_object_collapse() kinda expects us not to block because we 
   768  *      supposedly do not need to allocate memory, but for the moment we
   769  *      *may* have to get a little memory from the zone allocator, though
   770  *      it is taken from interrupt memory.  We should be ok. 
  771  *
  772  *      The source object contains no vm_page_t's (which is just as well)
  773  *
  774  *      The source object is of type OBJT_SWAP.
  775  *
  776  *      The source and destination objects must be locked or 
  777  *      inaccessible (XXX are they ?)
  778  */
  779 void
  780 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
  781     vm_pindex_t offset, int destroysource)
  782 {
  783         vm_pindex_t i;
  784         int s;
  785 
  786         VM_OBJECT_LOCK_ASSERT(srcobject, MA_OWNED);
  787         VM_OBJECT_LOCK_ASSERT(dstobject, MA_OWNED);
  788 
  789         s = splvm();
  790         /*
  791          * If destroysource is set, we remove the source object from the 
  792          * swap_pager internal queue now. 
  793          */
  794         if (destroysource) {
  795                 if (srcobject->handle != NULL) {
  796                         mtx_lock(&sw_alloc_mtx);
  797                         TAILQ_REMOVE(
  798                             NOBJLIST(srcobject->handle),
  799                             srcobject,
  800                             pager_object_list
  801                         );
  802                         mtx_unlock(&sw_alloc_mtx);
  803                 }
  804         }
  805 
  806         /*
  807          * transfer source to destination.
  808          */
  809         for (i = 0; i < dstobject->size; ++i) {
  810                 daddr_t dstaddr;
  811 
  812                 /*
  813                  * Locate (without changing) the swapblk on the destination,
  814                  * unless it is invalid in which case free it silently, or
  815                  * if the destination is a resident page, in which case the
  816                  * source is thrown away.
  817                  */
  818                 dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
  819 
  820                 if (dstaddr == SWAPBLK_NONE) {
  821                         /*
  822                          * Destination has no swapblk and is not resident,
  823                          * copy source.
  824                          */
  825                         daddr_t srcaddr;
  826 
  827                         srcaddr = swp_pager_meta_ctl(
  828                             srcobject, 
  829                             i + offset,
  830                             SWM_POP
  831                         );
  832 
  833                         if (srcaddr != SWAPBLK_NONE) {
  834                                 /*
  835                                  * swp_pager_meta_build() can sleep.
  836                                  */
  837                                 vm_object_pip_add(srcobject, 1);
  838                                 VM_OBJECT_UNLOCK(srcobject);
  839                                 vm_object_pip_add(dstobject, 1);
  840                                 swp_pager_meta_build(dstobject, i, srcaddr);
  841                                 vm_object_pip_wakeup(dstobject);
  842                                 VM_OBJECT_LOCK(srcobject);
  843                                 vm_object_pip_wakeup(srcobject);
  844                         }
  845                 } else {
  846                         /*
  847                          * Destination has valid swapblk or it is represented
  848                          * by a resident page.  We destroy the sourceblock.
  849                          */
  850                         
  851                         swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
  852                 }
  853         }
  854 
  855         /*
  856          * Free left over swap blocks in source.
  857          *
   858  *      We have to revert the type to OBJT_DEFAULT so we do not accidentally
  859          * double-remove the object from the swap queues.
  860          */
  861         if (destroysource) {
  862                 swp_pager_meta_free_all(srcobject);
  863                 /*
  864                  * Reverting the type is not necessary, the caller is going
  865                  * to destroy srcobject directly, but I'm doing it here
  866                  * for consistency since we've removed the object from its
  867                  * queues.
  868                  */
  869                 srcobject->type = OBJT_DEFAULT;
  870         }
  871         splx(s);
  872 }
  873 
  874 /*
  875  * SWAP_PAGER_HASPAGE() -       determine if we have good backing store for
  876  *                              the requested page.
  877  *
  878  *      We determine whether good backing store exists for the requested
  879  *      page and return TRUE if it does, FALSE if it doesn't.
  880  *
  881  *      If TRUE, we also try to determine how much valid, contiguous backing
  882  *      store exists before and after the requested page within a reasonable
  883  *      distance.  We do not try to restrict it to the swap device stripe
  884  *      (that is handled in getpages/putpages).  It probably isn't worth
  885  *      doing here.
  886  */
  887 static boolean_t
  888 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after)
  889 {
  890         daddr_t blk0;
  891         int s;
  892 
  893         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
  894         /*
  895          * do we have good backing store at the requested index ?
  896          */
  897         s = splvm();
  898         blk0 = swp_pager_meta_ctl(object, pindex, 0);
  899 
  900         if (blk0 == SWAPBLK_NONE) {
  901                 splx(s);
  902                 if (before)
  903                         *before = 0;
  904                 if (after)
  905                         *after = 0;
  906                 return (FALSE);
  907         }
  908 
  909         /*
  910          * find backwards-looking contiguous good backing store
  911          */
  912         if (before != NULL) {
  913                 int i;
  914 
  915                 for (i = 1; i < (SWB_NPAGES/2); ++i) {
  916                         daddr_t blk;
  917 
  918                         if (i > pindex)
  919                                 break;
  920                         blk = swp_pager_meta_ctl(object, pindex - i, 0);
  921                         if (blk != blk0 - i)
  922                                 break;
  923                 }
  924                 *before = (i - 1);
  925         }
  926 
  927         /*
  928          * find forward-looking contiguous good backing store
  929          */
  930         if (after != NULL) {
  931                 int i;
  932 
  933                 for (i = 1; i < (SWB_NPAGES/2); ++i) {
  934                         daddr_t blk;
  935 
  936                         blk = swp_pager_meta_ctl(object, pindex + i, 0);
  937                         if (blk != blk0 + i)
  938                                 break;
  939                 }
  940                 *after = (i - 1);
  941         }
  942         splx(s);
  943         return (TRUE);
  944 }
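
       /*
        * Editor's note (illustrative): if the three pages before pindex and
        * the two pages after it carry consecutive swap blocks, this returns
        * TRUE with *before = 3 and *after = 2.  Both counts are capped at
        * SWB_NPAGES/2 - 1, i.e. 7 with the default SWB_NPAGES of 16.
        */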
  945 
  946 /*
  947  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
  948  *
  949  *      This removes any associated swap backing store, whether valid or
  950  *      not, from the page.  
  951  *
  952  *      This routine is typically called when a page is made dirty, at
  953  *      which point any associated swap can be freed.  MADV_FREE also
  954  *      calls us in a special-case situation
  955  *
  956  *      NOTE!!!  If the page is clean and the swap was valid, the caller
  957  *      should make the page dirty before calling this routine.  This routine
  958  *      does NOT change the m->dirty status of the page.  Also: MADV_FREE
  959  *      depends on it.
  960  *
  961  *      This routine may not block
  962  *      This routine must be called at splvm()
  963  */
  964 static void
  965 swap_pager_unswapped(vm_page_t m)
  966 {
  967 
  968         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
  969         swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
  970 }
  971 
  972 /*
  973  * SWAP_PAGER_GETPAGES() - bring pages in from swap
  974  *
  975  *      Attempt to retrieve (m, count) pages from backing store, but make
  976  *      sure we retrieve at least m[reqpage].  We try to load in as large
  977  *      a chunk surrounding m[reqpage] as is contiguous in swap and which
  978  *      belongs to the same object.
  979  *
  980  *      The code is designed for asynchronous operation and 
  981  *      immediate-notification of 'reqpage' but tends not to be
  982  *      used that way.  Please do not optimize-out this algorithmic
  983  *      feature, I intend to improve on it in the future.
  984  *
  985  *      The parent has a single vm_object_pip_add() reference prior to
  986  *      calling us and we should return with the same.
  987  *
  988  *      The parent has BUSY'd the pages.  We should return with 'm'
  989  *      left busy, but the others adjusted.
  990  */
  991 static int
  992 swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
  993 {
  994         struct buf *bp;
  995         vm_page_t mreq;
  996         int s;
  997         int i;
  998         int j;
  999         daddr_t blk;
 1000 
 1001         mreq = m[reqpage];
 1002 
 1003         KASSERT(mreq->object == object,
 1004             ("swap_pager_getpages: object mismatch %p/%p",
 1005             object, mreq->object));
 1006 
 1007         /*
 1008          * Calculate range to retrieve.  The pages have already been assigned
 1009          * their swapblks.  We require a *contiguous* range but we know it to
 1010          * not span devices.   If we do not supply it, bad things
 1011          * happen.  Note that blk, iblk & jblk can be SWAPBLK_NONE, but the 
 1012          * loops are set up such that the case(s) are handled implicitly.
 1013          *
 1014          * The swp_*() calls must be made at splvm().  vm_page_free() does
 1015          * not need to be, but it will go a little faster if it is.
 1016          */
 1017         s = splvm();
 1018         blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
 1019 
 1020         for (i = reqpage - 1; i >= 0; --i) {
 1021                 daddr_t iblk;
 1022 
 1023                 iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
 1024                 if (blk != iblk + (reqpage - i))
 1025                         break;
 1026         }
 1027         ++i;
 1028 
 1029         for (j = reqpage + 1; j < count; ++j) {
 1030                 daddr_t jblk;
 1031 
 1032                 jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
 1033                 if (blk != jblk - (j - reqpage))
 1034                         break;
 1035         }
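
       /*
        * Editor's note: after the two loops above, m[i..j-1] is the largest
        * run around the requested page whose swap blocks are consecutive,
        * i.e. m[k] is backed by block blk + (k - reqpage) for i <= k < j.
        */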
 1036 
 1037         /*
 1038          * free pages outside our collection range.   Note: we never free
 1039          * mreq, it must remain busy throughout.
 1040          */
 1041         vm_page_lock_queues();
 1042         {
 1043                 int k;
 1044 
 1045                 for (k = 0; k < i; ++k)
 1046                         vm_page_free(m[k]);
 1047                 for (k = j; k < count; ++k)
 1048                         vm_page_free(m[k]);
 1049         }
 1050         vm_page_unlock_queues();
 1051         splx(s);
 1052 
 1053 
 1054         /*
 1055          * Return VM_PAGER_FAIL if we have nothing to do.  Return mreq 
 1056          * still busy, but the others unbusied.
 1057          */
 1058         if (blk == SWAPBLK_NONE)
 1059                 return (VM_PAGER_FAIL);
 1060 
 1061         /*
 1062          * Getpbuf() can sleep.
 1063          */
 1064         VM_OBJECT_UNLOCK(object);
 1065         /*
 1066          * Get a swap buffer header to perform the IO
 1067          */
 1068         bp = getpbuf(&nsw_rcount);
 1069         bp->b_flags |= B_PAGING;
 1070 
 1071         /*
 1072          * map our page(s) into kva for input
 1073          */
 1074         pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i);
 1075 
 1076         bp->b_iocmd = BIO_READ;
 1077         bp->b_iodone = swp_pager_async_iodone;
 1078         bp->b_rcred = crhold(thread0.td_ucred);
 1079         bp->b_wcred = crhold(thread0.td_ucred);
 1080         bp->b_blkno = blk - (reqpage - i);
 1081         bp->b_bcount = PAGE_SIZE * (j - i);
 1082         bp->b_bufsize = PAGE_SIZE * (j - i);
 1083         bp->b_pager.pg_reqpage = reqpage - i;
 1084 
 1085         VM_OBJECT_LOCK(object);
 1086         vm_page_lock_queues();
 1087         {
 1088                 int k;
 1089 
 1090                 for (k = i; k < j; ++k) {
 1091                         bp->b_pages[k - i] = m[k];
 1092                         vm_page_flag_set(m[k], PG_SWAPINPROG);
 1093                 }
 1094         }
 1095         vm_page_unlock_queues();
 1096         VM_OBJECT_UNLOCK(object);
 1097         bp->b_npages = j - i;
 1098 
 1099         cnt.v_swapin++;
 1100         cnt.v_swappgsin += bp->b_npages;
 1101 
 1102         /*
 1103          * We still hold the lock on mreq, and our automatic completion routine
 1104          * does not remove it.
 1105          */
 1106         VM_OBJECT_LOCK(mreq->object);
 1107         vm_object_pip_add(mreq->object, bp->b_npages);
 1108         VM_OBJECT_UNLOCK(mreq->object);
 1109 
 1110         /*
 1111          * perform the I/O.  NOTE!!!  bp cannot be considered valid after
 1112          * this point because we automatically release it on completion.
 1113          * Instead, we look at the one page we are interested in which we
 1114          * still hold a lock on even through the I/O completion.
 1115          *
 1116          * The other pages in our m[] array are also released on completion,
 1117          * so we cannot assume they are valid anymore either.
 1118          *
 1119          * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 1120          */
 1121         BUF_KERNPROC(bp);
 1122         swp_pager_strategy(bp);
 1123 
 1124         /*
 1125          * wait for the page we want to complete.  PG_SWAPINPROG is always
 1126          * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
 1127          * is set in the meta-data.
 1128          */
 1129         s = splvm();
 1130         vm_page_lock_queues();
 1131         while ((mreq->flags & PG_SWAPINPROG) != 0) {
 1132                 vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED);
 1133                 cnt.v_intrans++;
 1134                 if (msleep(mreq, &vm_page_queue_mtx, PSWP, "swread", hz*20)) {
 1135                         printf(
 1136 "swap_pager: indefinite wait buffer: device: %s, blkno: %jd, size: %ld\n",
 1137                             bp->b_dev == NULL ? "[NULL]" : devtoname(bp->b_dev),
 1138                             (intmax_t)bp->b_blkno, bp->b_bcount);
 1139                 }
 1140         }
 1141         vm_page_unlock_queues();
 1142         splx(s);
 1143 
 1144         VM_OBJECT_LOCK(mreq->object);
 1145         /*
 1146          * mreq is left busied after completion, but all the other pages
 1147          * are freed.  If we had an unrecoverable read error the page will
 1148          * not be valid.
 1149          */
 1150         if (mreq->valid != VM_PAGE_BITS_ALL) {
 1151                 return (VM_PAGER_ERROR);
 1152         } else {
 1153                 return (VM_PAGER_OK);
 1154         }
 1155 
 1156         /*
 1157          * A final note: in a low swap situation, we cannot deallocate swap
 1158          * and mark a page dirty here because the caller is likely to mark
 1159          * the page clean when we return, causing the page to possibly revert 
 1160          * to all-zero's later.
 1161          */
 1162 }
 1163 
 1164 /*
 1165  *      swap_pager_putpages: 
 1166  *
 1167  *      Assign swap (if necessary) and initiate I/O on the specified pages.
 1168  *
 1169  *      We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
 1170  *      are automatically converted to SWAP objects.
 1171  *
 1172  *      In a low memory situation we may block in VOP_STRATEGY(), but the new 
 1173  *      vm_page reservation system coupled with properly written VFS devices 
 1174  *      should ensure that no low-memory deadlock occurs.  This is an area
 1175  *      which needs work.
 1176  *
 1177  *      The parent has N vm_object_pip_add() references prior to
 1178  *      calling us and will remove references for rtvals[] that are
 1179  *      not set to VM_PAGER_PEND.  We need to remove the rest on I/O
 1180  *      completion.
 1181  *
 1182  *      The parent has soft-busy'd the pages it passes us and will unbusy
  1183  *      those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
 1184  *      We need to unbusy the rest on I/O completion.
 1185  */
 1186 void
 1187 swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
 1188     boolean_t sync, int *rtvals)
 1189 {
 1190         int i;
 1191         int n = 0;
 1192 
 1193         GIANT_REQUIRED;
 1194         if (count && m[0]->object != object) {
  1195                 panic("swap_pager_putpages: object mismatch %p/%p", 
 1196                     object, 
 1197                     m[0]->object
 1198                 );
 1199         }
 1200 
 1201         /*
 1202          * Step 1
 1203          *
 1204          * Turn object into OBJT_SWAP
 1205          * check for bogus sysops
 1206          * force sync if not pageout process
 1207          */
 1208         if (object->type != OBJT_SWAP)
 1209                 swp_pager_meta_build(object, 0, SWAPBLK_NONE);
 1210         VM_OBJECT_UNLOCK(object);
 1211 
 1212         if (curproc != pageproc)
 1213                 sync = TRUE;
 1214 
 1215         /*
 1216          * Step 2
 1217          *
 1218          * Update nsw parameters from swap_async_max sysctl values.  
 1219          * Do not let the sysop crash the machine with bogus numbers.
 1220          */
 1221         mtx_lock(&pbuf_mtx);
 1222         if (swap_async_max != nsw_wcount_async_max) {
 1223                 int n;
 1224                 int s;
 1225 
 1226                 /*
 1227                  * limit range
 1228                  */
 1229                 if ((n = swap_async_max) > nswbuf / 2)
 1230                         n = nswbuf / 2;
 1231                 if (n < 1)
 1232                         n = 1;
 1233                 swap_async_max = n;
 1234 
 1235                 /*
 1236                  * Adjust difference ( if possible ).  If the current async
 1237                  * count is too low, we may not be able to make the adjustment
 1238                  * at this time.
 1239                  */
 1240                 s = splvm();
 1241                 n -= nsw_wcount_async_max;
 1242                 if (nsw_wcount_async + n >= 0) {
 1243                         nsw_wcount_async += n;
 1244                         nsw_wcount_async_max += n;
 1245                         wakeup(&nsw_wcount_async);
 1246                 }
 1247                 splx(s);
 1248         }
 1249         mtx_unlock(&pbuf_mtx);
 1250 
 1251         /*
 1252          * Step 3
 1253          *
 1254          * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
 1255          * The page is left dirty until the pageout operation completes
 1256          * successfully.
 1257          */
 1258         for (i = 0; i < count; i += n) {
 1259                 int s;
 1260                 int j;
 1261                 struct buf *bp;
 1262                 daddr_t blk;
 1263 
 1264                 /*
 1265                  * Maximum I/O size is limited by a number of factors.
 1266                  */
 1267                 n = min(BLIST_MAX_ALLOC, count - i);
 1268                 n = min(n, nsw_cluster_max);
 1269 
 1270                 s = splvm();
 1271 
 1272                 /*
 1273                  * Get biggest block of swap we can.  If we fail, fall
 1274                  * back and try to allocate a smaller block.  Don't go
 1275                  * overboard trying to allocate space if it would overly
 1276                  * fragment swap.
 1277                  */
 1278                 while (
 1279                     (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE &&
 1280                     n > 4
 1281                 ) {
 1282                         n >>= 1;
 1283                 }
 1284                 if (blk == SWAPBLK_NONE) {
 1285                         for (j = 0; j < n; ++j)
 1286                                 rtvals[i+j] = VM_PAGER_FAIL;
 1287                         splx(s);
 1288                         continue;
 1289                 }
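
       /*
        * Editor's note: n starts at min(BLIST_MAX_ALLOC, count - i,
        * nsw_cluster_max) and is halved after each failed swap allocation
        * while it is still larger than four pages; if even that fails, the
        * chunk's n pages are marked VM_PAGER_FAIL above and the outer loop
        * moves on to the next chunk.
        */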
 1290 
 1291                 /*
 1292                  * All I/O parameters have been satisfied, build the I/O
 1293                  * request and assign the swap space.
 1294                  */
 1295                 if (sync == TRUE) {
 1296                         bp = getpbuf(&nsw_wcount_sync);
 1297                 } else {
 1298                         bp = getpbuf(&nsw_wcount_async);
 1299                         bp->b_flags = B_ASYNC;
 1300                 }
 1301                 bp->b_flags |= B_PAGING;
 1302                 bp->b_iocmd = BIO_WRITE;
 1303 
 1304                 pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
 1305 
 1306                 bp->b_rcred = crhold(thread0.td_ucred);
 1307                 bp->b_wcred = crhold(thread0.td_ucred);
 1308                 bp->b_bcount = PAGE_SIZE * n;
 1309                 bp->b_bufsize = PAGE_SIZE * n;
 1310                 bp->b_blkno = blk;
 1311 
 1312                 VM_OBJECT_LOCK(object);
 1313                 for (j = 0; j < n; ++j) {
 1314                         vm_page_t mreq = m[i+j];
 1315 
 1316                         swp_pager_meta_build(
 1317                             mreq->object, 
 1318                             mreq->pindex,
 1319                             blk + j
 1320                         );
 1321                         vm_page_dirty(mreq);
 1322                         rtvals[i+j] = VM_PAGER_OK;
 1323 
 1324                         vm_page_lock_queues();
 1325                         vm_page_flag_set(mreq, PG_SWAPINPROG);
 1326                         vm_page_unlock_queues();
 1327                         bp->b_pages[j] = mreq;
 1328                 }
 1329                 VM_OBJECT_UNLOCK(object);
 1330                 bp->b_npages = n;
 1331                 /*
 1332                  * Must set dirty range for NFS to work.
 1333                  */
 1334                 bp->b_dirtyoff = 0;
 1335                 bp->b_dirtyend = bp->b_bcount;
 1336 
 1337                 cnt.v_swapout++;
 1338                 cnt.v_swappgsout += bp->b_npages;
 1339 
 1340                 splx(s);
 1341 
 1342                 /*
 1343                  * asynchronous
 1344                  *
 1345                  * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 1346                  */
 1347                 if (sync == FALSE) {
 1348                         bp->b_iodone = swp_pager_async_iodone;
 1349                         BUF_KERNPROC(bp);
 1350                         swp_pager_strategy(bp);
 1351 
 1352                         for (j = 0; j < n; ++j)
 1353                                 rtvals[i+j] = VM_PAGER_PEND;
  1354                         /* restart outer loop */
 1355                         continue;
 1356                 }
 1357 
 1358                 /*
 1359                  * synchronous
 1360                  *
 1361                  * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 1362                  */
 1363                 bp->b_iodone = bdone;
 1364                 swp_pager_strategy(bp);
 1365 
 1366                 /*
 1367                  * Wait for the sync I/O to complete, then update rtvals.
 1368                  * We just set the rtvals[] to VM_PAGER_PEND so we can call
 1369                  * our async completion routine at the end, thus avoiding a
 1370                  * double-free.
 1371                  */
 1372                 s = splbio();
 1373                 bwait(bp, PVM, "swwrt");
 1374                 for (j = 0; j < n; ++j)
 1375                         rtvals[i+j] = VM_PAGER_PEND;
 1376                 /*
 1377                  * Now that we are through with the bp, we can call the
 1378                  * normal async completion, which frees everything up.
 1379                  */
 1380                 swp_pager_async_iodone(bp);
 1381                 splx(s);
 1382         }
 1383         VM_OBJECT_LOCK(object);
 1384 }
 1385 
 1386 /*
 1387  *      swp_pager_async_iodone:
 1388  *
 1389  *      Completion routine for asynchronous reads and writes from/to swap.
 1390  *      Also called manually by synchronous code to finish up a bp.
 1391  *
 1392  *      For READ operations, the pages are PG_BUSY'd.  For WRITE operations,
 1393  *      the pages are vm_page_t->busy'd.  For READ operations we clear
 1394  *      PG_BUSY on (unbusy) all pages except the 'main' request page.  For
 1395  *      WRITE operations we drop the vm_page_t->busy count on all pages
 1396  *      ( we can do this because we marked them all VM_PAGER_PEND in putpages ).
 1397  *
 1398  *      This routine may not block.
 1399  *      This routine is called at splbio() or better
 1400  *
 1401  *      We up ourselves to splvm() as required for various vm_page related
 1402  *      calls.
 1403  */
 1404 static void
 1405 swp_pager_async_iodone(struct buf *bp)
 1406 {
 1407         int s;
 1408         int i;
 1409         vm_object_t object = NULL;
 1410 
 1411         bp->b_flags |= B_DONE;
 1412 
 1413         /*
 1414          * report error
 1415          */
 1416         if (bp->b_ioflags & BIO_ERROR) {
 1417                 printf(
 1418                     "swap_pager: I/O error - %s failed; blkno %ld,"
 1419                         " size %ld, error %d\n",
 1420                     ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
 1421                     (long)bp->b_blkno, 
 1422                     (long)bp->b_bcount,
 1423                     bp->b_error
 1424                 );
 1425         }
 1426 
 1427         /*
 1428          * set object, raise to splvm().
 1429          */
 1430         s = splvm();
 1431 
 1432         /*
 1433          * remove the mapping for kernel virtual
 1434          */
 1435         pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
 1436 
 1437         if (bp->b_npages) {
 1438                 object = bp->b_pages[0]->object;
 1439                 VM_OBJECT_LOCK(object);
 1440         }
 1441         vm_page_lock_queues();
 1442         /*
 1443          * cleanup pages.  If an error occurs writing to swap, we are in
 1444          * very serious trouble.  If it happens to be a disk error, though,
 1445          * we may be able to recover by reassigning the swap later on.  So
 1446          * in this case we remove the m->swapblk assignment for the page 
 1447                  * but do not free it in the rlist.  The erroneous block(s) are thus
 1448          * never reallocated as swap.  Redirty the page and continue.
 1449          */
 1450         for (i = 0; i < bp->b_npages; ++i) {
 1451                 vm_page_t m = bp->b_pages[i];
 1452 
 1453                 vm_page_flag_clear(m, PG_SWAPINPROG);
 1454 
 1455                 if (bp->b_ioflags & BIO_ERROR) {
 1456                         /*
 1457                          * If an error occurs I'd love to throw the swapblk
 1458                          * away without freeing it back to swapspace, so it
 1459                          * can never be used again.  But I can't from an 
 1460                          * interrupt.
 1461                          */
 1462                         if (bp->b_iocmd == BIO_READ) {
 1463                                 /*
 1464                                  * When reading, reqpage needs to stay
 1465                                  * locked for the parent, but all other
 1466                                  * pages can be freed.  We still want to
 1467                                  * wakeup the parent waiting on the page,
 1468                                  * though.  ( also: pg_reqpage can be -1 and 
 1469                                  * not match anything ).
 1470                                  *
 1471                                  * We have to wake specifically requested pages
 1472                                  * up too because we cleared PG_SWAPINPROG and
 1473                                  * someone may be waiting for that.
 1474                                  *
 1475                                  * NOTE: for reads, m->dirty will probably
 1476                                  * be overridden by the original caller of
 1477                                  * getpages so don't play cute tricks here.
 1478                                  *
 1479                                  * XXX IT IS NOT LEGAL TO FREE THE PAGE HERE
 1480                                  * AS THIS MESSES WITH object->memq, and it is
 1481                                  * not legal to mess with object->memq from an
 1482                                  * interrupt.
 1483                                  */
 1484                                 m->valid = 0;
 1485                                 if (i != bp->b_pager.pg_reqpage)
 1486                                         vm_page_free(m);
 1487                                 else
 1488                                         vm_page_flash(m);
 1489                                 /*
 1490                                  * If i == bp->b_pager.pg_reqpage, do not wake 
 1491                                  * the page up.  The caller needs to.
 1492                                  */
 1493                         } else {
 1494                                 /*
 1495                                  * If a write error occurs, reactivate page
 1496                                  * so it doesn't clog the inactive list,
 1497                                  * then finish the I/O.
 1498                                  */
 1499                                 vm_page_dirty(m);
 1500                                 vm_page_activate(m);
 1501                                 vm_page_io_finish(m);
 1502                         }
 1503                 } else if (bp->b_iocmd == BIO_READ) {
 1504                         /*
 1505                          * For read success, clear dirty bits.  Nobody should
 1506                          * have this page mapped but don't take any chances,
 1507                          * make sure the pmap modify bits are also cleared.
 1508                          *
 1509                          * NOTE: for reads, m->dirty will probably be 
 1510                          * overridden by the original caller of getpages so
 1511                          * we cannot set them in order to free the underlying
 1512                          * swap in a low-swap situation.  I don't think we'd
 1513                          * want to do that anyway, but it was an optimization
 1514                          * that existed in the old swapper for a time before
 1515                          * it got ripped out due to precisely this problem.
 1516                          *
 1517                          * If not the requested page then deactivate it.
 1518                          *
 1519                          * Note that the requested page, reqpage, is left
 1520                          * busied, but we still have to wake it up.  The
 1521                          * other pages are released (unbusied) by 
 1522                          * vm_page_wakeup().  We do not set reqpage's
 1523                          * valid bits here, it is up to the caller.
 1524                          */
 1525                         pmap_clear_modify(m);
 1526                         m->valid = VM_PAGE_BITS_ALL;
 1527                         vm_page_undirty(m);
 1528 
 1529                         /*
 1530                          * We have to wake specifically requested pages
 1531                          * up too because we cleared PG_SWAPINPROG and
 1532                          * could be waiting for it in getpages.  However,
 1533                          * be sure to not unbusy getpages specifically
 1534                          * requested page - getpages expects it to be 
 1535                          * left busy.
 1536                          */
 1537                         if (i != bp->b_pager.pg_reqpage) {
 1538                                 vm_page_deactivate(m);
 1539                                 vm_page_wakeup(m);
 1540                         } else {
 1541                                 vm_page_flash(m);
 1542                         }
 1543                 } else {
 1544                         /*
 1545                          * For write success, clear the modify and dirty 
 1546                          * status, then finish the I/O ( which decrements the 
 1547                          * busy count and possibly wakes waiters up ).
 1548                          */
 1549                         pmap_clear_modify(m);
 1550                         vm_page_undirty(m);
 1551                         vm_page_io_finish(m);
 1552                         if (vm_page_count_severe())
 1553                                 vm_page_try_to_cache(m);
 1554                 }
 1555         }
 1556         vm_page_unlock_queues();
 1557 
 1558         /*
 1559          * adjust pip.  NOTE: the original parent may still have its own
 1560          * pip refs on the object.
 1561          */
 1562         if (object != NULL) {
 1563                 vm_object_pip_wakeupn(object, bp->b_npages);
 1564                 VM_OBJECT_UNLOCK(object);
 1565         }
 1566 
 1567         /*
 1568          * release the physical I/O buffer
 1569          */
 1570         relpbuf(
 1571             bp, 
 1572             ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : 
 1573                 ((bp->b_flags & B_ASYNC) ? 
 1574                     &nsw_wcount_async : 
 1575                     &nsw_wcount_sync
 1576                 )
 1577             )
 1578         );
 1579         splx(s);
 1580 }
 1581 
 1582 /*
 1583  *      swap_pager_isswapped:
 1584  *
 1585  *      Return 1 if at least one page in the given object is paged
 1586  *      out to the given swap device.
 1587  *
 1588  *      This routine may not block.
 1589  */
 1590 int
 1591 swap_pager_isswapped(vm_object_t object, struct swdevt *sp)
 1592 {
 1593         daddr_t index = 0;
 1594         int bcount;
 1595         int i;
 1596 
 1597         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1598         if (object->type != OBJT_SWAP)
 1599                 return (0);
 1600 
 1601         for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) {
 1602                 struct swblock *swap;
 1603 
 1604                 mtx_lock(&swhash_mtx);
 1605                 if ((swap = *swp_pager_hash(object, index)) != NULL) {
 1606                         for (i = 0; i < SWAP_META_PAGES; ++i) {
 1607                                 daddr_t v = swap->swb_pages[i];
 1608                                 if (v == SWAPBLK_NONE)
 1609                                         continue;
 1610                                 if (swp_pager_find_dev(v) == sp) {
 1611                                         mtx_unlock(&swhash_mtx);
 1612                                         return 1;
 1613                                 }
 1614                         }
 1615                 }
 1616                 mtx_unlock(&swhash_mtx);
 1617                 index += SWAP_META_PAGES;
 1618                 if (index > 0x20000000)
 1619                         panic("swap_pager_isswapped: failed to locate all swap meta blocks");
 1620         }
 1621         return 0;
 1622 }
 1623 
 1624 /*
 1625  * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
 1626  *
 1627  *      This routine dissociates the page at the given index within a
 1628  *      swap block from its backing store, paging it in if necessary.
 1629  *      If the page is paged in, it is placed in the inactive queue,
 1630  *      since it had its backing store ripped out from under it.
 1631  *      We also attempt to swap in all other pages in the swap block,
 1632  *      but we only guarantee that the one at the specified index is
 1633  *      paged in.
 1634  *
 1635  *      XXX - The code to page the whole block in doesn't work, so we
 1636  *            revert to the one-by-one behavior for now.  Sigh.
 1637  */
 1638 static __inline void
 1639 swp_pager_force_pagein(struct swblock *swap, int idx)
 1640 {
 1641         vm_object_t object;
 1642         vm_page_t m;
 1643         vm_pindex_t pindex;
 1644 
 1645         object = swap->swb_object;
 1646         pindex = swap->swb_index;
 1647         mtx_unlock(&swhash_mtx);
 1648 
 1649         VM_OBJECT_LOCK(object);
 1650         vm_object_pip_add(object, 1);
 1651         m = vm_page_grab(object, pindex + idx, VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
 1652         if (m->valid == VM_PAGE_BITS_ALL) {
 1653                 vm_object_pip_subtract(object, 1);
 1654                 vm_page_lock_queues();
 1655                 vm_page_activate(m);
 1656                 vm_page_dirty(m);
 1657                 vm_page_wakeup(m);
 1658                 vm_page_unlock_queues();
 1659                 vm_pager_page_unswapped(m);
 1660                 VM_OBJECT_UNLOCK(object);
 1661                 return;
 1662         }
 1663 
 1664         if (swap_pager_getpages(object, &m, 1, 0) !=
 1665             VM_PAGER_OK)
 1666                 panic("swap_pager_force_pagein: read from swap failed");/*XXX*/
 1667         vm_object_pip_subtract(object, 1);
 1668         vm_page_lock_queues();
 1669         vm_page_dirty(m);
 1670         vm_page_dontneed(m);
 1671         vm_page_wakeup(m);
 1672         vm_page_unlock_queues();
 1673         vm_pager_page_unswapped(m);
 1674         VM_OBJECT_UNLOCK(object);
 1675 }
 1676 
 1677 
 1678 /*
 1679  *      swap_pager_swapoff:
 1680  *
 1681  *      Page in all of the pages that have been paged out to the
 1682  *      given device.  The corresponding blocks in the bitmap must be
 1683  *      marked as allocated and the device must be flagged SW_CLOSING.
 1684  *      There may be no processes swapped out to the device.
 1685  *
 1686  *      The sw_used parameter points to the field in the swdevt structure
 1687  *      that contains a count of the number of blocks still allocated
 1688  *      on the device.  If we encounter objects with a nonzero pip count
 1689  *      in our scan, we use this number to determine if we're really done.
 1690  *
 1691  *      This routine may block.
 1692  */
 1693 static void
 1694 swap_pager_swapoff(struct swdevt *sp, int *sw_used)
 1695 {
 1696         struct swblock **pswap;
 1697         struct swblock *swap;
 1698         vm_object_t waitobj;
 1699         daddr_t v;
 1700         int i, j;
 1701 
 1702         GIANT_REQUIRED;
 1703 
 1704 full_rescan:
 1705         waitobj = NULL;
 1706         for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
 1707 restart:
 1708                 pswap = &swhash[i];
 1709                 mtx_lock(&swhash_mtx);
 1710                 while ((swap = *pswap) != NULL) {
 1711                         for (j = 0; j < SWAP_META_PAGES; ++j) {
 1712                                 v = swap->swb_pages[j];
 1713                                 if (v != SWAPBLK_NONE &&
 1714                                     swp_pager_find_dev(v) == sp)
 1715                                         break;
 1716                         }
 1717                         if (j < SWAP_META_PAGES) {
 1718                                 swp_pager_force_pagein(swap, j);
 1719                                 goto restart;
 1720                         } else if (swap->swb_object->paging_in_progress) {
 1721                                 if (!waitobj)
 1722                                         waitobj = swap->swb_object;
 1723                         }
 1724                         pswap = &swap->swb_hnext;
 1725                 }
 1726                 mtx_unlock(&swhash_mtx);
 1727         }
 1728         if (waitobj && *sw_used) {
 1729             /*
 1730              * We wait on an arbitrary object to clock our rescans
 1731              * to the rate of paging completion.
 1732              */
 1733             VM_OBJECT_LOCK(waitobj);
 1734             vm_object_pip_wait(waitobj, "swpoff");
 1735             VM_OBJECT_UNLOCK(waitobj);
 1736             goto full_rescan;
 1737         }
 1738         if (*sw_used)
 1739             panic("swapoff: failed to locate %d swap blocks", *sw_used);
 1740 }
 1741 
 1742 /************************************************************************
 1743  *                              SWAP META DATA                          *
 1744  ************************************************************************
 1745  *
 1746  *      These routines manipulate the swap metadata stored in the 
 1747  *      OBJT_SWAP object.  All swp_*() routines must be called at
 1748  *      splvm() because swap can be freed up by the low level vm_page
 1749  *      code which might be called from interrupts beyond what splbio() covers.
 1750  *
 1751  *      Swap metadata is implemented with a global hash and not directly
 1752  *      linked into the object.  Instead the object simply contains
 1753  *      appropriate tracking counters.
 1754  */
 1755 
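/*
 * The lookup idiom shared by the routines below looks roughly like the
 * sketch that follows (illustrative only, hence #if 0): swp_pager_hash(),
 * defined earlier in this file, resolves any hash collision internally and
 * returns a pointer to the matching chain link, or to the NULL end of the
 * chain if the (object, pindex) pair has no swblock yet.
 */
#if 0
	mtx_lock(&swhash_mtx);
	pswap = swp_pager_hash(object, pindex);
	if ((swap = *pswap) != NULL)
		blk = swap->swb_pages[pindex & SWAP_META_MASK]; /* slot in block */
	else
		blk = SWAPBLK_NONE;	/* no metadata recorded for this page */
	mtx_unlock(&swhash_mtx);
#endif
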
 1756 /*
 1757  * SWP_PAGER_META_BUILD() -     add swap block to swap meta data for object
 1758  *
 1759  *      We first convert the object to a swap object if it is a default
 1760  *      object.
 1761  *
 1762  *      The specified swapblk is added to the object's swap metadata.  If
 1763  *      the swapblk is not valid, it is freed instead.  Any previously
 1764  *      assigned swapblk is freed.
 1765  *
 1766  *      This routine must be called at splvm(), except when used to convert
 1767  *      an OBJT_DEFAULT object into an OBJT_SWAP object.
 1768  */
 1769 static void
 1770 swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
 1771 {
 1772         struct swblock *swap;
 1773         struct swblock **pswap;
 1774         int idx;
 1775 
 1776         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1777         /*
 1778          * Convert default object to swap object if necessary
 1779          */
 1780         if (object->type != OBJT_SWAP) {
 1781                 object->type = OBJT_SWAP;
 1782                 object->un_pager.swp.swp_bcount = 0;
 1783 
 1784                 if (object->handle != NULL) {
 1785                         mtx_lock(&sw_alloc_mtx);
 1786                         TAILQ_INSERT_TAIL(
 1787                             NOBJLIST(object->handle),
 1788                             object, 
 1789                             pager_object_list
 1790                         );
 1791                         mtx_unlock(&sw_alloc_mtx);
 1792                 }
 1793         }
 1794         
 1795         /*
 1796          * Locate hash entry.  If not found create, but if we aren't adding
 1797          * anything just return.  If we run out of space in the map we wait
 1798          * and, since the hash table may have changed, retry.
 1799          */
 1800 retry:
 1801         mtx_lock(&swhash_mtx);
 1802         pswap = swp_pager_hash(object, pindex);
 1803 
 1804         if ((swap = *pswap) == NULL) {
 1805                 int i;
 1806 
 1807                 if (swapblk == SWAPBLK_NONE)
 1808                         goto done;
 1809 
 1810                 swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT);
 1811                 if (swap == NULL) {
 1812                         mtx_unlock(&swhash_mtx);
 1813                         VM_OBJECT_UNLOCK(object);
 1814                         VM_WAIT;
 1815                         VM_OBJECT_LOCK(object);
 1816                         goto retry;
 1817                 }
 1818 
 1819                 swap->swb_hnext = NULL;
 1820                 swap->swb_object = object;
 1821                 swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK;
 1822                 swap->swb_count = 0;
 1823 
 1824                 ++object->un_pager.swp.swp_bcount;
 1825 
 1826                 for (i = 0; i < SWAP_META_PAGES; ++i)
 1827                         swap->swb_pages[i] = SWAPBLK_NONE;
 1828         }
 1829 
 1830         /*
 1831          * Delete prior contents of metadata
 1832          */
 1833         idx = pindex & SWAP_META_MASK;
 1834 
 1835         if (swap->swb_pages[idx] != SWAPBLK_NONE) {
 1836                 swp_pager_freeswapspace(swap->swb_pages[idx], 1);
 1837                 --swap->swb_count;
 1838         }
 1839 
 1840         /*
 1841          * Enter block into metadata
 1842          */
 1843         swap->swb_pages[idx] = swapblk;
 1844         if (swapblk != SWAPBLK_NONE)
 1845                 ++swap->swb_count;
 1846 done:
 1847         mtx_unlock(&swhash_mtx);
 1848 }
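
/*
 * A worked example of the index split used above (SWAP_META_PAGES is
 * assumed to be 16 here): pindex 37 belongs to the swblock whose swb_index
 * is 37 & ~SWAP_META_MASK == 32, and occupies slot 37 & SWAP_META_MASK == 5
 * of its swb_pages[] array.
 */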
 1849 
 1850 /*
 1851  * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
 1852  *
 1853  *      The requested range of blocks is freed, with any associated swap 
 1854  *      returned to the swap bitmap.
 1855  *
 1856  *      This routine will free swap metadata structures as they are cleaned 
 1857  *      out.  This routine does *NOT* operate on swap metadata associated
 1858  *      with resident pages.
 1859  *
 1860  *      This routine must be called at splvm()
 1861  */
 1862 static void
 1863 swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
 1864 {
 1865 
 1866         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1867         if (object->type != OBJT_SWAP)
 1868                 return;
 1869 
 1870         while (count > 0) {
 1871                 struct swblock **pswap;
 1872                 struct swblock *swap;
 1873 
 1874                 mtx_lock(&swhash_mtx);
 1875                 pswap = swp_pager_hash(object, index);
 1876 
 1877                 if ((swap = *pswap) != NULL) {
 1878                         daddr_t v = swap->swb_pages[index & SWAP_META_MASK];
 1879 
 1880                         if (v != SWAPBLK_NONE) {
 1881                                 swp_pager_freeswapspace(v, 1);
 1882                                 swap->swb_pages[index & SWAP_META_MASK] =
 1883                                         SWAPBLK_NONE;
 1884                                 if (--swap->swb_count == 0) {
 1885                                         *pswap = swap->swb_hnext;
 1886                                         uma_zfree(swap_zone, swap);
 1887                                         --object->un_pager.swp.swp_bcount;
 1888                                 }
 1889                         }
 1890                         --count;
 1891                         ++index;
 1892                 } else {
 1893                         int n = SWAP_META_PAGES - (index & SWAP_META_MASK);
 1894                         count -= n;
 1895                         index += n;
 1896                 }
 1897                 mtx_unlock(&swhash_mtx);
 1898         }
 1899 }
 1900 
 1901 /*
 1902  * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
 1903  *
 1904  *      This routine locates and destroys all swap metadata associated with
 1905  *      an object.
 1906  *
 1907  *      This routine must be called at splvm()
 1908  */
 1909 static void
 1910 swp_pager_meta_free_all(vm_object_t object)
 1911 {
 1912         daddr_t index = 0;
 1913 
 1914         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1915         if (object->type != OBJT_SWAP)
 1916                 return;
 1917 
 1918         while (object->un_pager.swp.swp_bcount) {
 1919                 struct swblock **pswap;
 1920                 struct swblock *swap;
 1921 
 1922                 mtx_lock(&swhash_mtx);
 1923                 pswap = swp_pager_hash(object, index);
 1924                 if ((swap = *pswap) != NULL) {
 1925                         int i;
 1926 
 1927                         for (i = 0; i < SWAP_META_PAGES; ++i) {
 1928                                 daddr_t v = swap->swb_pages[i];
 1929                                 if (v != SWAPBLK_NONE) {
 1930                                         --swap->swb_count;
 1931                                         swp_pager_freeswapspace(v, 1);
 1932                                 }
 1933                         }
 1934                         if (swap->swb_count != 0)
 1935                                 panic("swap_pager_meta_free_all: swb_count != 0");
 1936                         *pswap = swap->swb_hnext;
 1937                         uma_zfree(swap_zone, swap);
 1938                         --object->un_pager.swp.swp_bcount;
 1939                 }
 1940                 mtx_unlock(&swhash_mtx);
 1941                 index += SWAP_META_PAGES;
 1942                 if (index > 0x20000000)
 1943                         panic("swp_pager_meta_free_all: failed to locate all swap meta blocks");
 1944         }
 1945 }
 1946 
 1947 /*
 1948  * SWP_PAGER_METACTL() -  misc control of swap and vm_page_t meta data.
 1949  *
 1950  *      This routine is capable of looking up, popping, or freeing
 1951  *      swapblk assignments in the swap meta data or in the vm_page_t.
 1952  *      The routine typically returns the swapblk being looked-up, or popped,
 1953  *      or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
 1954  *      was invalid.  This routine will automatically free any invalid 
 1955  *      meta-data swapblks.
 1956  *
 1957  *      It is not possible to store invalid swapblks in the swap meta data
 1958  *      (other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
 1959  *
 1960  *      When acting on a busy resident page and paging is in progress, we 
 1961  *      have to wait until paging is complete but otherwise can act on the 
 1962  *      busy page.
 1963  *
 1964  *      This routine must be called at splvm().
 1965  *
 1966  *      SWM_FREE        remove and free swap block from metadata
 1967  *      SWM_POP         remove from meta data but do not free.. pop it out
 1968  */
 1969 static daddr_t
 1970 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
 1971 {
 1972         struct swblock **pswap;
 1973         struct swblock *swap;
 1974         daddr_t r1;
 1975         int idx;
 1976 
 1977         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1978         /*
 1979          * The meta data only exists if the object is OBJT_SWAP
 1980          * and even then might not be allocated yet.
 1981          */
 1982         if (object->type != OBJT_SWAP)
 1983                 return (SWAPBLK_NONE);
 1984 
 1985         r1 = SWAPBLK_NONE;
 1986         mtx_lock(&swhash_mtx);
 1987         pswap = swp_pager_hash(object, pindex);
 1988 
 1989         if ((swap = *pswap) != NULL) {
 1990                 idx = pindex & SWAP_META_MASK;
 1991                 r1 = swap->swb_pages[idx];
 1992 
 1993                 if (r1 != SWAPBLK_NONE) {
 1994                         if (flags & SWM_FREE) {
 1995                                 swp_pager_freeswapspace(r1, 1);
 1996                                 r1 = SWAPBLK_NONE;
 1997                         }
 1998                         if (flags & (SWM_FREE|SWM_POP)) {
 1999                                 swap->swb_pages[idx] = SWAPBLK_NONE;
 2000                                 if (--swap->swb_count == 0) {
 2001                                         *pswap = swap->swb_hnext;
 2002                                         uma_zfree(swap_zone, swap);
 2003                                         --object->un_pager.swp.swp_bcount;
 2004                                 }
 2005                         } 
 2006                 }
 2007         }
 2008         mtx_unlock(&swhash_mtx);
 2009         return (r1);
 2010 }
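
/*
 * Call patterns (a sketch; as noted above, the caller must hold the object
 * lock and run at splvm()):
 *
 *	blk = swp_pager_meta_ctl(object, pindex, 0);          look up only
 *	blk = swp_pager_meta_ctl(object, pindex, SWM_POP);    detach, caller keeps blk
 *	(void) swp_pager_meta_ctl(object, pindex, SWM_FREE);  detach and free the swap space
 */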
 2011 
 2012 /*
 2013  * System call swapon(name) enables swapping on device name,
 2014  * which must be in the swdevsw.  Return EBUSY
 2015  * if already swapping on this device.
 2016  */
 2017 #ifndef _SYS_SYSPROTO_H_
 2018 struct swapon_args {
 2019         char *name;
 2020 };
 2021 #endif
 2022 
 2023 /* 
 2024  * MPSAFE
 2025  */
 2026 /* ARGSUSED */
 2027 int
 2028 swapon(struct thread *td, struct swapon_args *uap)
 2029 {
 2030         struct vattr attr;
 2031         struct vnode *vp;
 2032         struct nameidata nd;
 2033         int error;
 2034 
 2035         mtx_lock(&Giant);
 2036         error = suser(td);
 2037         if (error)
 2038                 goto done2;
 2039 
 2040         while (swdev_syscall_active)
 2041             tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
 2042         swdev_syscall_active = 1;
 2043 
 2044         /*
 2045          * Swap metadata may not fit in the KVM if we have physical
 2046          * memory of >1GB.
 2047          */
 2048         if (swap_zone == NULL) {
 2049                 error = ENOMEM;
 2050                 goto done;
 2051         }
 2052 
 2053         NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
 2054         error = namei(&nd);
 2055         if (error)
 2056                 goto done;
 2057 
 2058         NDFREE(&nd, NDF_ONLY_PNBUF);
 2059         vp = nd.ni_vp;
 2060 
 2061         if (vn_isdisk(vp, &error)) {
 2062                 error = swapongeom(td, vp);
 2063         } else if (vp->v_type == VREG &&
 2064             (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 2065             (error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) == 0) {
 2066                 /*
 2067                  * Allow direct swapping to NFS regular files in the same
 2068                  * way that nfs_mountroot() sets up diskless swapping.
 2069                  */
 2070                 error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
 2071         }
 2072 
 2073         if (error)
 2074                 vrele(vp);
 2075 done:
 2076         swdev_syscall_active = 0;
 2077         wakeup_one(&swdev_syscall_active);
 2078 done2:
 2079         mtx_unlock(&Giant);
 2080         return (error);
 2081 }
 2082 
 2083 static void
 2084 swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev)
 2085 {
 2086         struct swdevt *sp, *tsp;
 2087         swblk_t dvbase;
 2088         u_long mblocks;
 2089 
 2090         /*
 2091          * If we go beyond this, we get overflows in the radix
 2092          * tree bitmap code.
 2093          */
 2094         mblocks = 0x40000000 / BLIST_META_RADIX;
 2095         if (nblks > mblocks) {
 2096                 printf("WARNING: reducing size to maximum of %lu blocks per swap unit\n",
 2097                         mblocks);
 2098                 nblks = mblocks;
 2099         }
 2100         /*
 2101          * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
 2102          * First chop nblks off to page-align it, then convert.
 2103          * 
 2104          * sw->sw_nblks is in page-sized chunks now too.
 2105          */
 2106         nblks &= ~(ctodb(1) - 1);
 2107         nblks = dbtoc(nblks);
 2108 
 2109         sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO);
 2110         sp->sw_vp = vp;
 2111         sp->sw_id = id;
 2112         sp->sw_dev = dev;
 2113         sp->sw_flags = 0;
 2114         sp->sw_nblks = nblks;
 2115         sp->sw_used = 0;
 2116         sp->sw_strategy = strategy;
 2117         sp->sw_close = close;
 2118 
 2119         sp->sw_blist = blist_create(nblks);
 2120         /*
 2121          * Do not free the first two blocks in order to avoid overwriting
 2122          * any BSD label at the front of the partition.
 2123          */
 2124         blist_free(sp->sw_blist, 2, nblks - 2);
 2125 
 2126         dvbase = 0;
 2127         mtx_lock(&sw_dev_mtx);
 2128         TAILQ_FOREACH(tsp, &swtailq, sw_list) {
 2129                 if (tsp->sw_end >= dvbase) {
 2130                         /*
 2131                          * We put one uncovered page between the devices
 2132                          * in order to definitively prevent any cross-device
 2133                          * I/O requests
 2134                          */
 2135                         dvbase = tsp->sw_end + 1;
 2136                 }
 2137         }
 2138         sp->sw_first = dvbase;
 2139         sp->sw_end = dvbase + nblks;
 2140         TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
 2141         nswapdev++;
 2142         swap_pager_avail += nblks;
 2143         swp_sizecheck();
 2144         mtx_unlock(&sw_dev_mtx);
 2145 }
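
/*
 * A worked conversion example (assuming PAGE_SIZE == 4096 and
 * DEV_BSIZE == 512, so ctodb(1) == 8): a 999999-sector unit is first
 * rounded down to 999992 sectors by "nblks &= ~(ctodb(1) - 1)" and
 * dbtoc() then yields 124999 swap pages.  The blist_free() above leaves
 * pages 0 and 1 allocated so any label at the front of the partition is
 * never written over, and the one-page gap inserted between devices keeps
 * an I/O request from ever straddling two swap devices.
 */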
 2146 
 2147 /*
 2148  * SYSCALL: swapoff(devname)
 2149  *
 2150  * Disable swapping on the given device.
 2151  *
 2152  * XXX: Badly designed system call: it should use a device index
 2153  * rather than filename as specification.  We keep sw_vp around
 2154  * only to make this work.
 2155  */
 2156 #ifndef _SYS_SYSPROTO_H_
 2157 struct swapoff_args {
 2158         char *name;
 2159 };
 2160 #endif
 2161 
 2162 /*
 2163  * MPSAFE
 2164  */
 2165 /* ARGSUSED */
 2166 int
 2167 swapoff(struct thread *td, struct swapoff_args *uap)
 2168 {
 2169         struct vnode *vp;
 2170         struct nameidata nd;
 2171         struct swdevt *sp;
 2172         u_long nblks, dvbase;
 2173         int error;
 2174 
 2175         mtx_lock(&Giant);
 2176 
 2177         error = suser(td);
 2178         if (error)
 2179                 goto done2;
 2180 
 2181         while (swdev_syscall_active)
 2182             tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
 2183         swdev_syscall_active = 1;
 2184 
 2185         NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
 2186         error = namei(&nd);
 2187         if (error)
 2188                 goto done;
 2189         NDFREE(&nd, NDF_ONLY_PNBUF);
 2190         vp = nd.ni_vp;
 2191 
 2192         mtx_lock(&sw_dev_mtx);
 2193         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2194                 if (sp->sw_vp == vp)
 2195                         goto found;
 2196         }
 2197         mtx_unlock(&sw_dev_mtx);
 2198         error = EINVAL;
 2199         goto done;
 2200 found:
 2201         mtx_unlock(&sw_dev_mtx);
 2202 #ifdef MAC
 2203         (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 2204         error = mac_check_system_swapoff(td->td_ucred, vp);
 2205         (void) VOP_UNLOCK(vp, 0, td);
 2206         if (error != 0)
 2207                 goto done;
 2208 #endif
 2209         
 2210         nblks = sp->sw_nblks;
 2211 
 2212         /*
 2213          * We can turn off this swap device safely only if the
 2214          * available virtual memory in the system will fit the amount
 2215          * of data we will have to page back in, plus an epsilon so
 2216          * the system doesn't become critically low on swap space.
 2217          */
 2218         if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail <
 2219             nblks + nswap_lowat) {
 2220                 error = ENOMEM;
 2221                 goto done;
 2222         }
 2223 
 2224         /*
 2225          * Prevent further allocations on this device.
 2226          */
 2227         mtx_lock(&sw_dev_mtx);
 2228         sp->sw_flags |= SW_CLOSING;
 2229         for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) {
 2230                 swap_pager_avail -= blist_fill(sp->sw_blist,
 2231                      dvbase, dmmax);
 2232         }
 2233         mtx_unlock(&sw_dev_mtx);
 2234 
 2235         /*
 2236          * Page in the contents of the device and close it.
 2237          */
 2238 #ifndef NO_SWAPPING
 2239         vm_proc_swapin_all(sp);
 2240 #endif /* !NO_SWAPPING */
 2241         swap_pager_swapoff(sp, &sp->sw_used);
 2242 
 2243         sp->sw_close(td, sp);
 2244         sp->sw_id = NULL;
 2245         mtx_lock(&sw_dev_mtx);
 2246         TAILQ_REMOVE(&swtailq, sp, sw_list);
 2247         nswapdev--;
 2248         if (nswapdev == 0) {
 2249                 swap_pager_full = 2;
 2250                 swap_pager_almost_full = 1;
 2251         }
 2252         if (swdevhd == sp)
 2253                 swdevhd = NULL;
 2254         mtx_unlock(&sw_dev_mtx);
 2255         blist_destroy(sp->sw_blist);
 2256         free(sp, M_VMPGDATA);
 2257 
 2258 done:
 2259         swdev_syscall_active = 0;
 2260         wakeup_one(&swdev_syscall_active);
 2261 done2:
 2262         mtx_unlock(&Giant);
 2263         return (error);
 2264 }
 2265 
 2266 void
 2267 swap_pager_status(int *total, int *used)
 2268 {
 2269         struct swdevt *sp;
 2270 
 2271         *total = 0;
 2272         *used = 0;
 2273         mtx_lock(&sw_dev_mtx);
 2274         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2275                 *total += sp->sw_nblks;
 2276                 *used += sp->sw_used;
 2277         }
 2278         mtx_unlock(&sw_dev_mtx);
 2279 }
 2280 
 2281 static int
 2282 sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
 2283 {
 2284         int     *name = (int *)arg1;
 2285         int     error, n;
 2286         struct xswdev xs;
 2287         struct swdevt *sp;
 2288 
 2289         if (arg2 != 1) /* name length */
 2290                 return (EINVAL);
 2291 
 2292         n = 0;
 2293         mtx_lock(&sw_dev_mtx);
 2294         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2295                 if (n == *name) {
 2296                         mtx_unlock(&sw_dev_mtx);
 2297                         xs.xsw_version = XSWDEV_VERSION;
 2298                         xs.xsw_dev = sp->sw_dev;
 2299                         xs.xsw_flags = sp->sw_flags;
 2300                         xs.xsw_nblks = sp->sw_nblks;
 2301                         xs.xsw_used = sp->sw_used;
 2302 
 2303                         error = SYSCTL_OUT(req, &xs, sizeof(xs));
 2304                         return (error);
 2305                 }
 2306                 n++;
 2307         }
 2308         mtx_unlock(&sw_dev_mtx);
 2309         return (ENOENT);
 2310 }
 2311 
 2312 SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
 2313     "Number of swap devices");
 2314 SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info,
 2315     "Swap statistics by device");
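
/*
 * The vm.swap_info node above is what swapinfo(8)-style tools consume from
 * userland.  A minimal consumer sketch follows (not part of this file,
 * hence #if 0; struct xswdev is assumed to come from <vm/vm_param.h> and
 * error handling is kept to a bare minimum): the final name component is
 * the device index, and the handler returns ENOENT past the last device.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <vm/vm_param.h>
#include <stdio.h>

int
main(void)
{
	struct xswdev xsw;
	size_t len, miblen;
	int mib[4], n;

	miblen = 3;			/* room for "vm.swap_info" + index */
	if (sysctlnametomib("vm.swap_info", mib, &miblen) != 0)
		return (1);
	for (n = 0; ; n++) {
		mib[miblen] = n;	/* append the device index */
		len = sizeof(xsw);
		if (sysctl(mib, (u_int)(miblen + 1), &xsw, &len, NULL, 0) != 0)
			break;		/* ENOENT past the last device */
		printf("swap device %d: %d blocks, %d used\n",
		    n, (int)xsw.xsw_nblks, (int)xsw.xsw_used);
	}
	return (0);
}
#endif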
 2316 
 2317 /*
 2318  * vmspace_swap_count() - count the approximate swap usage in pages for a
 2319  *                        vmspace.
 2320  *
 2321  *      The map must be locked.
 2322  *
 2323  *      Swap usage is determined by taking the proportional swap used by
 2324  *      VM objects backing the VM map.  To make up for fractional losses,
 2325  *      if the VM object has any swap use at all the associated map entries
 2326  *      count for at least 1 swap page.
 2327  */
 2328 int
 2329 vmspace_swap_count(struct vmspace *vmspace)
 2330 {
 2331         vm_map_t map = &vmspace->vm_map;
 2332         vm_map_entry_t cur;
 2333         int count = 0;
 2334 
 2335         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 2336                 vm_object_t object;
 2337 
 2338                 if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
 2339                     (object = cur->object.vm_object) != NULL) {
 2340                         VM_OBJECT_LOCK(object);
 2341                         if (object->type == OBJT_SWAP &&
 2342                             object->un_pager.swp.swp_bcount != 0) {
 2343                                 int n = (cur->end - cur->start) / PAGE_SIZE;
 2344 
 2345                                 count += object->un_pager.swp.swp_bcount *
 2346                                     SWAP_META_PAGES * n / object->size + 1;
 2347                         }
 2348                         VM_OBJECT_UNLOCK(object);
 2349                 }
 2350         }
 2351         return (count);
 2352 }
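
/*
 * A worked example of the estimate above (SWAP_META_PAGES is assumed to be
 * 16): an object of size 1024 pages with swp_bcount == 4 swblocks, mapped
 * by an entry spanning 256 pages (n == 256), contributes
 * 4 * 16 * 256 / 1024 + 1 == 17 pages to the count; the trailing "+ 1"
 * guarantees that an object with any swap use at all is charged at least
 * one page per map entry.
 */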
 2353 
 2354 /*
 2355  * GEOM backend
 2356  *
 2357  * Swapping onto disk devices.
 2358  *
 2359  */
 2360 
 2361 static g_orphan_t swapgeom_orphan;
 2362 
 2363 static struct g_class g_swap_class = {
 2364         .name = "SWAP",
 2365         .version = G_VERSION,
 2366         .orphan = swapgeom_orphan,
 2367 };
 2368 
 2369 DECLARE_GEOM_CLASS(g_swap_class, g_class);
 2370 
 2371 
 2372 static void
 2373 swapgeom_done(struct bio *bp2)
 2374 {
 2375         struct buf *bp;
 2376 
 2377         bp = bp2->bio_caller2;
 2378         if (bp2->bio_error)
 2379                 bp->b_ioflags |= BIO_ERROR;
 2380         bufdone(bp);
 2381         g_destroy_bio(bp2);
 2382 }
 2383 
 2384 static void
 2385 swapgeom_strategy(struct buf *bp, struct swdevt *sp)
 2386 {
 2387         struct bio *bio;
 2388         struct g_consumer *cp;
 2389 
 2390         cp = sp->sw_id;
 2391         if (cp == NULL) {
 2392                 bp->b_error = ENXIO;
 2393                 bp->b_ioflags |= BIO_ERROR;
 2394                 bufdone(bp);
 2395                 return;
 2396         }
 2397         bio = g_clone_bio(&bp->b_io);
 2398         if (bio == NULL) {
 2399                 /*
 2400                  * XXX: This is better than panicking, but not much better.
 2401                  * XXX: Somehow this should be retried.  A more generic
 2402                  * XXX: implementation of ENOMEM in geom may be able to cope.
 2403                  */
 2404                 bp->b_error = ENOMEM;
 2405                 bp->b_ioflags |= BIO_ERROR;
 2406                 bufdone(bp);
 2407                 return;
 2408         }
 2409         bio->bio_caller2 = bp;
 2410         bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
 2411         bio->bio_length = bp->b_bcount;
 2412         bio->bio_done = swapgeom_done;
 2413         g_io_request(bio, cp);
 2414         return;
 2415 }
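
/*
 * Offset bookkeeping, GEOM case: b_blkno arrives as a global swap page
 * number, so a request for swap page sw_first + 4 of a device becomes
 * bio_offset == 4 * PAGE_SIZE bytes from the start of the provider
 * (e.g. 16384 bytes assuming 4 KB pages).
 */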
 2416 
 2417 static void
 2418 swapgeom_orphan(struct g_consumer *cp)
 2419 {
 2420         struct swdevt *sp;
 2421 
 2422         mtx_lock(&sw_dev_mtx);
 2423         TAILQ_FOREACH(sp, &swtailq, sw_list)
 2424                 if (sp->sw_id == cp)
 2425                         sp->sw_id = NULL;
 2426         mtx_unlock(&sw_dev_mtx);
 2427 }
 2428 
 2429 static void
 2430 swapgeom_close_ev(void *arg, int flags)
 2431 {
 2432         struct g_consumer *cp;
 2433 
 2434         cp = arg;
 2435         g_access(cp, -1, -1, 0);
 2436         g_detach(cp);
 2437         g_destroy_consumer(cp);
 2438 }
 2439 
 2440 static void
 2441 swapgeom_close(struct thread *td, struct swdevt *sw)
 2442 {
 2443 
 2444         /* XXX: direct call when Giant untangled */
 2445         g_waitfor_event(swapgeom_close_ev, sw->sw_id, M_WAITOK, NULL);
 2446 }
 2447 
 2448 
 2449 struct swh0h0 {
 2450         struct cdev *dev;
 2451         struct vnode *vp;
 2452         int     error;
 2453 };
 2454 
 2455 static void
 2456 swapongeom_ev(void *arg, int flags)
 2457 {
 2458         struct swh0h0 *swh;
 2459         struct g_provider *pp;
 2460         struct g_consumer *cp;
 2461         static struct g_geom *gp;
 2462         struct swdevt *sp;
 2463         u_long nblks;
 2464         int error;
 2465 
 2466         swh = arg;
 2467         swh->error = 0;
 2468         pp = g_dev_getprovider(swh->dev);
 2469         if (pp == NULL) {
 2470                 swh->error = ENODEV;
 2471                 return;
 2472         }
 2473         mtx_lock(&sw_dev_mtx);
 2474         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2475                 cp = sp->sw_id;
 2476                 if (cp != NULL && cp->provider == pp) {
 2477                         mtx_unlock(&sw_dev_mtx);
 2478                         swh->error = EBUSY;
 2479                         return;
 2480                 }
 2481         }
 2482         mtx_unlock(&sw_dev_mtx);
 2483         if (gp == NULL)
 2484                 gp = g_new_geomf(&g_swap_class, "swap", NULL);
 2485         cp = g_new_consumer(gp);
 2486         g_attach(cp, pp);
 2487         /*
 2488          * XXX: Every time you think you can improve the margin for
 2489          * footshooting, somebody depends on the ability to do so:
 2490          * savecore(8) wants to write to our swapdev so we cannot
 2491          * set an exclusive count :-(
 2492          */
 2493         error = g_access(cp, 1, 1, 0);
 2494         if (error) {
 2495                 g_detach(cp);
 2496                 g_destroy_consumer(cp);
 2497                 swh->error = error;
 2498                 return;
 2499         }
 2500         nblks = pp->mediasize / DEV_BSIZE;
 2501         swaponsomething(swh->vp, cp, nblks, swapgeom_strategy,
 2502             swapgeom_close, dev2udev(swh->dev));
 2503         swh->error = 0;
 2504         return;
 2505 }
 2506 
 2507 static int
 2508 swapongeom(struct thread *td, struct vnode *vp)
 2509 {
 2510         int error;
 2511         struct swh0h0 swh;
 2512 
 2513         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 2514 
 2515         swh.dev = vp->v_rdev;
 2516         swh.vp = vp;
 2517         swh.error = 0;
 2518         /* XXX: direct call when Giant untangled */
 2519         error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL);
 2520         if (!error)
 2521                 error = swh.error;
 2522         VOP_UNLOCK(vp, 0, td);
 2523         return (error);
 2524 }
 2525 
 2526 /*
 2527  * VNODE backend
 2528  *
 2529  * This is used mainly for network filesystem (read: probably only tested
 2530  * with NFS) swapfiles.
 2531  *
 2532  */
 2533 
 2534 static void
 2535 swapdev_strategy(struct buf *bp, struct swdevt *sp)
 2536 {
 2537         int s;
 2538         struct vnode *vp, *vp2;
 2539 
 2540         bp->b_dev = NULL;
 2541         bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first);
 2542 
 2543         vp2 = sp->sw_id;
 2544         vhold(vp2);
 2545         s = splvm();
 2546         if (bp->b_iocmd == BIO_WRITE) {
 2547                 vp = bp->b_vp;
 2548                 if (vp) {
 2549                         VI_LOCK(vp);
 2550                         vp->v_numoutput--;
 2551                         if ((vp->v_iflag & VI_BWAIT) && vp->v_numoutput <= 0) {
 2552                                 vp->v_iflag &= ~VI_BWAIT;
 2553                                 wakeup(&vp->v_numoutput);
 2554                         }
 2555                         VI_UNLOCK(vp);
 2556                 }
 2557                 VI_LOCK(vp2);
 2558                 vp2->v_numoutput++;
 2559                 VI_UNLOCK(vp2);
 2560         }
 2561         bp->b_vp = vp2;
 2562         splx(s);
 2563         bp->b_iooffset = dbtob(bp->b_blkno);
 2564         VOP_STRATEGY(vp2, bp);
 2565         return;
 2566 }
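
/*
 * Offset bookkeeping, vnode case (assuming PAGE_SIZE 4096 and DEV_BSIZE
 * 512): b_blkno is rescaled from swap pages to 512-byte blocks by ctodb()
 * above (page 4 of the device -> block 32), and dbtob() then produces the
 * byte offset handed to VOP_STRATEGY() (block 32 -> 16384 bytes).
 */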
 2567 
 2568 static void
 2569 swapdev_close(struct thread *td, struct swdevt *sp)
 2570 {
 2571 
 2572         VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td);
 2573         vrele(sp->sw_vp);
 2574 }
 2575 
 2576 
 2577 static int
 2578 swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
 2579 {
 2580         struct swdevt *sp;
 2581         int error;
 2582 
 2583         if (nblks == 0)
 2584                 return (ENXIO);
 2585         mtx_lock(&sw_dev_mtx);
 2586         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2587                 if (sp->sw_id == vp) {
 2588                         mtx_unlock(&sw_dev_mtx);
 2589                         return (EBUSY);
 2590                 }
 2591         }
 2592         mtx_unlock(&sw_dev_mtx);
 2593     
 2594         (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 2595 #ifdef MAC
 2596         error = mac_check_system_swapon(td->td_ucred, vp);
 2597         if (error == 0)
 2598 #endif
 2599                 error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, -1);
 2600         (void) VOP_UNLOCK(vp, 0, td);
 2601         if (error)
 2602                 return (error);
 2603 
 2604         swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
 2605             NODEV);
 2606         return (0);
 2607 }
