The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/vm/swap_pager.c

Version: -  FREEBSD  -  FREEBSD11  -  FREEBSD10  -  FREEBSD9  -  FREEBSD92  -  FREEBSD91  -  FREEBSD90  -  FREEBSD8  -  FREEBSD82  -  FREEBSD81  -  FREEBSD80  -  FREEBSD7  -  FREEBSD74  -  FREEBSD73  -  FREEBSD72  -  FREEBSD71  -  FREEBSD70  -  FREEBSD6  -  FREEBSD64  -  FREEBSD63  -  FREEBSD62  -  FREEBSD61  -  FREEBSD60  -  FREEBSD5  -  FREEBSD55  -  FREEBSD54  -  FREEBSD53  -  FREEBSD52  -  FREEBSD51  -  FREEBSD50  -  FREEBSD4  -  FREEBSD3  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 1998 Matthew Dillon,
    3  * Copyright (c) 1994 John S. Dyson
    4  * Copyright (c) 1990 University of Utah.
    5  * Copyright (c) 1982, 1986, 1989, 1993
    6  *      The Regents of the University of California.  All rights reserved.
    7  *
    8  * This code is derived from software contributed to Berkeley by
    9  * the Systems Programming Group of the University of Utah Computer
   10  * Science Department.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. All advertising materials mentioning features or use of this software
   21  *    must display the following acknowledgement:
   22  *      This product includes software developed by the University of
   23  *      California, Berkeley and its contributors.
   24  * 4. Neither the name of the University nor the names of its contributors
   25  *    may be used to endorse or promote products derived from this software
   26  *    without specific prior written permission.
   27  *
   28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   38  * SUCH DAMAGE.
   39  *
   40  *                              New Swap System
   41  *                              Matthew Dillon
   42  *
   43  * Radix Bitmap 'blists'.
   44  *
   45  *      - The new swapper uses the new radix bitmap code.  This should scale
   46  *        to arbitrarily small or arbitrarily large swap spaces and an almost
   47  *        arbitrary degree of fragmentation.
   48  *
   49  * Features:
   50  *
   51  *      - on the fly reallocation of swap during putpages.  The new system
   52  *        does not try to keep previously allocated swap blocks for dirty
   53  *        pages.  
   54  *
   55  *      - on the fly deallocation of swap
   56  *
   57  *      - No more garbage collection required.  Unnecessarily allocated swap
   58  *        blocks only exist for dirty vm_page_t's now and these are already
   59  *        cycled (in a high-load system) by the pager.  We also do on-the-fly
   60  *        removal of invalidated swap blocks when a page is destroyed
   61  *        or renamed.
   62  *
   63  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
   64  *
   65  *      @(#)swap_pager.c        8.9 (Berkeley) 3/21/94
   66  *      @(#)vm_swap.c   8.5 (Berkeley) 2/17/94
   67  */
   68 
   69 #include <sys/cdefs.h>
   70 __FBSDID("$FreeBSD: src/sys/vm/swap_pager.c,v 1.273.2.1 2005/08/20 06:07:55 alc Exp $");
   71 
   72 #include "opt_mac.h"
   73 #include "opt_swap.h"
   74 #include "opt_vm.h"
   75 
   76 #include <sys/param.h>
   77 #include <sys/systm.h>
   78 #include <sys/conf.h>
   79 #include <sys/kernel.h>
   80 #include <sys/proc.h>
   81 #include <sys/bio.h>
   82 #include <sys/buf.h>
   83 #include <sys/disk.h>
   84 #include <sys/fcntl.h>
   85 #include <sys/mount.h>
   86 #include <sys/namei.h>
   87 #include <sys/vnode.h>
   88 #include <sys/mac.h>
   89 #include <sys/malloc.h>
   90 #include <sys/sysctl.h>
   91 #include <sys/sysproto.h>
   92 #include <sys/blist.h>
   93 #include <sys/lock.h>
   94 #include <sys/sx.h>
   95 #include <sys/vmmeter.h>
   96 
   97 #include <vm/vm.h>
   98 #include <vm/pmap.h>
   99 #include <vm/vm_map.h>
  100 #include <vm/vm_kern.h>
  101 #include <vm/vm_object.h>
  102 #include <vm/vm_page.h>
  103 #include <vm/vm_pager.h>
  104 #include <vm/vm_pageout.h>
  105 #include <vm/vm_param.h>
  106 #include <vm/swap_pager.h>
  107 #include <vm/vm_extern.h>
  108 #include <vm/uma.h>
  109 
  110 #include <geom/geom.h>
  111 
  112 /*
  113  * SWB_NPAGES must be a power of 2.  It may be set to 1, 2, 4, 8, or 16
  114  * pages per allocation; unless overridden it follows MAX_PAGEOUT_CLUSTER
  115  * (default 16).  The 16-page cap is due to kern/subr_blist.c (radix code).
  116  */
  117 #ifndef MAX_PAGEOUT_CLUSTER
  118 #define MAX_PAGEOUT_CLUSTER 16
  119 #endif
  120 
  121 #if !defined(SWB_NPAGES)
  122 #define SWB_NPAGES      MAX_PAGEOUT_CLUSTER
  123 #endif
  124 
  125 /*
  126  * Piecemeal swap metadata structure.  Swap is stored in a radix tree.
  127  *
  128  * If SWB_NPAGES is 8 and sizeof(char *) == sizeof(daddr_t), our radix
  129  * is basically 8.  Assuming PAGE_SIZE == 4096, one tree level represents
  130  * 32K worth of data, two levels represent 256K, three levels represent
  131  * 2 MBytes.   This is acceptable.
  132  *
  133  * Overall memory utilization is about the same as the old swap structure.
  134  */
  135 #define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t))   /* n scaled by pointer size / daddr_t size */
  136 #define SWAP_META_PAGES         (SWB_NPAGES * 2)        /* entries in swblock.swb_pages[] */
  137 #define SWAP_META_MASK          (SWAP_META_PAGES - 1)   /* low index bits, cleared by swp_pager_hash() */
  138 
  139 typedef int32_t swblk_t;        /*
  140                                  * swap offset.  This is the type used to
  141                                  * address the "virtual swap device".  It
  142                                  * is a signed 32-bit integer, so at most
  143                                  * 2^31 pages of swap can be addressed.
  144                                  */
  145 
  146 struct swdevt;
  147 typedef void sw_strategy_t(struct buf *bp, struct swdevt *sw);  /* per-device I/O entry point */
  148 typedef void sw_close_t(struct thread *td, struct swdevt *sw);
  149 
  150 /*
  151  * Swap device table: one entry per swap device, linked on swtailq.
  152  */
  153 struct swdevt {
  154         int     sw_flags;               /* SW_* flags, e.g. SW_CLOSING */
  155         int     sw_nblks;
  156         int     sw_used;                /* pages currently allocated from this device */
  157         dev_t   sw_dev;
  158         struct vnode *sw_vp;
  159         void    *sw_id;
  160         swblk_t sw_first;               /* first swap block number on this device */
  161         swblk_t sw_end;                 /* one past the last block number */
  162         struct blist *sw_blist;         /* allocation bitmap, offsets relative to sw_first */
  163         TAILQ_ENTRY(swdevt)     sw_list;        /* linkage on swtailq */
  164         sw_strategy_t           *sw_strategy;   /* called by swp_pager_strategy() */
  165         sw_close_t              *sw_close;
  166 };
  167 
  168 #define SW_CLOSING      0x04    /* device skipped by allocation; frees are not returned to its blist */
  169 /* Swap metadata node, found through swhash by (swb_object, swb_index). */
  170 struct swblock {
  171         struct swblock  *swb_hnext;     /* next swblock on the same hash chain */
  172         vm_object_t     swb_object;     /* object this metadata belongs to (hash key) */
  173         vm_pindex_t     swb_index;      /* page index with SWAP_META_MASK bits cleared (hash key) */
  174         int             swb_count;      /* NOTE(review): presumably count of assigned swb_pages[] - confirm */
  175         daddr_t         swb_pages[SWAP_META_PAGES];     /* presumably one swap block per covered page - confirm */
  176 };
  177 
  178 static struct mtx sw_dev_mtx;   /* held while walking swtailq and updating per-device state */
  179 static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq);  /* all swap devices */
  180 static struct swdevt *swdevhd;  /* Allocate from here next */
  181 static int nswapdev;            /* Number of swap devices */
  182 int swap_pager_avail;           /* free swap pages; adjusted on every alloc and free */
  183 static int swdev_syscall_active = 0; /* serialize swap(on|off) */
  184 
  185 static void swapdev_strategy(struct buf *, struct swdevt *sw);
  186 
  187 #define SWM_FREE        0x02    /* free, period                 */
  188 #define SWM_POP         0x04    /* pop out                      */
  189 
  190 int swap_pager_full = 2;        /* swap space exhaustion (task killing) */
  191 static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
  192 static int nsw_rcount;          /* free read buffers                    */
  193 static int nsw_wcount_sync;     /* limit write buffers / synchronous    */
  194 static int nsw_wcount_async;    /* limit write buffers / asynchronous   */
  195 static int nsw_wcount_async_max;/* assigned maximum                     */
  196 static int nsw_cluster_max;     /* maximum VOP I/O allowed              */
  197 
  198 static struct swblock **swhash;
  199 static int swhash_mask;
  200 static struct mtx swhash_mtx;
  201 
  202 static int swap_async_max = 4;  /* maximum in-progress async I/O's      */
  203 static struct sx sw_alloc_sx;
  204 
  205 
  206 SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
  207         CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
  208 
  209 /*
  210  * "named" and "unnamed" anon region objects.  Try to reduce the overhead
  211  * of searching a named list by hashing it just a little.
  212  */
  213 
  214 #define NOBJLISTS               8
  215 
  216 #define NOBJLIST(handle)        \
  217         (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
  218 
  219 static struct mtx sw_alloc_mtx; /* protect list manipulation */ 
  220 static struct pagerlst  swap_pager_object_list[NOBJLISTS];
  221 static uma_zone_t       swap_zone;
  222 static struct vm_object swap_zone_obj;
  223 
  224 /*
  225  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
  226  * calls hooked from other parts of the VM system and do not appear here.
  227  * (see vm/swap_pager.h).
  228  */
  229 static vm_object_t
  230                 swap_pager_alloc(void *handle, vm_ooffset_t size,
  231                                       vm_prot_t prot, vm_ooffset_t offset);
  232 static void     swap_pager_dealloc(vm_object_t object);
  233 static int      swap_pager_getpages(vm_object_t, vm_page_t *, int, int);
  234 static void     swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
  235 static boolean_t
  236                 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
  237 static void     swap_pager_init(void);
  238 static void     swap_pager_unswapped(vm_page_t);
  239 static void     swap_pager_swapoff(struct swdevt *sp);
  240 
  241 struct pagerops swappagerops = {
  242         .pgo_init =     swap_pager_init,        /* early system initialization of pager */
  243         .pgo_alloc =    swap_pager_alloc,       /* allocate an OBJT_SWAP object         */
  244         .pgo_dealloc =  swap_pager_dealloc,     /* deallocate an OBJT_SWAP object       */
  245         .pgo_getpages = swap_pager_getpages,    /* pagein                               */
  246         .pgo_putpages = swap_pager_putpages,    /* pageout                              */
  247         .pgo_haspage =  swap_pager_haspage,     /* get backing store status for page    */
  248         .pgo_pageunswapped = swap_pager_unswapped,      /* remove swap related to page          */
  249 };
  250 
  251 /*
  252  * dmmax is in page-sized chunks with the new swap system.  It was
  253  * dev-bsized chunks in the old.  dmmax is always a power of 2.
  254  *
  255  * swap_*() routines are externally accessible.  swp_*() routines are
  256  * internal.
  257  */
  258 static int dmmax;
  259 static int nswap_lowat = 128;   /* in pages, swap_pager_almost_full warn */
  260 static int nswap_hiwat = 512;   /* in pages, swap_pager_almost_full warn */
  261 
  262 SYSCTL_INT(_vm, OID_AUTO, dmmax,
  263         CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block");
  264 
  265 static void     swp_sizecheck(void);
  266 static void     swp_pager_async_iodone(struct buf *bp);
  267 static int      swapongeom(struct thread *, struct vnode *);
  268 static int      swaponvp(struct thread *, struct vnode *, u_long);
  269 
  270 /*
  271  * Swap bitmap functions
  272  */
  273 static void     swp_pager_freeswapspace(daddr_t blk, int npages);
  274 static daddr_t  swp_pager_getswapspace(int npages);
  275 
  276 /*
  277  * Metadata functions
  278  */
  279 static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index);
  280 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
  281 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
  282 static void swp_pager_meta_free_all(vm_object_t);
  283 static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
  284 
  285 /*
  286  * SWP_SIZECHECK() -    refresh the swap exhaustion indicators
  287  *
  288  *      Compares swap_pager_avail against the nswap_lowat / nswap_hiwat
  289  *      watermarks.  Falling below lowat raises swap_pager_almost_full
  290  *      (with a console message on the 0 -> 1 transition); it is lowered
  291  *      again only once availability exceeds hiwat, giving hysteresis.
  292  *
  293  *      swap_pager_full ( task killing ) is cleared whenever availability
  294  *      is at or above lowat.
  295  *      Takes no locks itself; the callers in this file hold sw_dev_mtx.
  296  */
  297 static void
  298 swp_sizecheck(void)
  299 {
  300 
  301         if (swap_pager_avail >= nswap_lowat) {
  302                 /* Enough room again: stop killing, maybe stop warning. */
  303                 swap_pager_full = 0;
  304                 if (swap_pager_avail > nswap_hiwat)
  305                         swap_pager_almost_full = 0;
  306                 return;
  307         }
  308         if (swap_pager_almost_full == 0) {
  309                 printf("swap_pager: out of swap space\n");
  310                 swap_pager_almost_full = 1;
  311         }
  312 
  313 /*
  314  * SWP_PAGER_HASH() -   look up swap metadata in the hash table
  315  *
  316  *      Helper that locates the swblock covering (object, index).  The
  317  *      index is first rounded down to a SWAP_META_PAGES boundary.  The
  318  *      result is the address of the chain link that points at the matching
  319  *      swblock or, if none exists, of the NULL link that ends the chain.
  320  *
  321  *      No locking is done here.
  322  */
  323 static struct swblock **
  324 swp_pager_hash(vm_object_t object, vm_pindex_t index)
  325 {
  326         struct swblock **link;
  327         struct swblock *node;
  328         vm_pindex_t base;
  329 
  330         /* Every swblock is keyed by the first page index it covers. */
  331         base = index & ~(vm_pindex_t)SWAP_META_MASK;
  332         link = &swhash[(base ^ (int)(intptr_t)object) & swhash_mask];
  333         for (node = *link; node != NULL; node = *link) {
  334                 if (node->swb_object == object && node->swb_index == base)
  335                         return (link);
  336                 link = &node->swb_hnext;
  337         }
  338         /* Not found: link addresses the NULL that terminates the chain. */
  339         return (link);
  340 }
  341 
  342 /*
  343  * SWAP_PAGER_INIT() -  early, one-time setup of the swap pager
  344  *
  345  *      Installed as the pgo_init hook of swappagerops.  Empties the object
  346  *      lists, initializes sw_alloc_mtx and sw_dev_mtx, and sets dmmax.
  347  *      NOTE: most of the VM system is not yet initialized at this point.
  348  */
  349 static void
  350 swap_pager_init(void)
  351 {
  352         int slot;
  353 
  354         /* Start every hashed object list out empty. */
  355         slot = 0;
  356         while (slot < NOBJLISTS) {
  357                 TAILQ_INIT(&swap_pager_object_list[slot]);
  358                 ++slot;
  359         }
  360 
  361         /* Locks for the object lists and the swap device table. */
  362         mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF);
  363         mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
  364 
  365         dmmax = SWB_NPAGES * 2; /* device stripe, in PAGE_SIZE'd blocks */
  366 }
  367 
  368 /*
  369  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
  370  *
  371  *      Expected to be started from pageout process once, prior to entering
  372  *      its main loop.  Sets the I/O buffer limits, the swblock zone and swhash.
  373  */
  374 void
  375 swap_pager_swap_init(void)
  376 {
  377         int n, n2;      /* n: working count; n2: value to compare against / reuse */
  378 
  379         /*
  380          * Number of in-transit swap bp operations.  Don't
  381          * exhaust the pbufs completely.  Make sure we
  382          * initialize workable values (0 will work for hysteresis
  383          * but it isn't very efficient).
  384          *
  385          * The nsw_cluster_max is constrained by the bp->b_pages[]
  386          * array (MAXPHYS/PAGE_SIZE) and our locally defined
  387          * MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
  388          * constrained by the swap device interleave stripe size.
  389          *
  390          * Currently we hardwire nsw_wcount_async to 4.  This limit is 
  391          * designed to prevent other I/O from having high latencies due to
  392          * our pageout I/O.  The value 4 works well for one or two active swap
  393          * devices but is probably a little low if you have more.  Even so,
  394          * a higher value would probably generate only a limited improvement
  395          * with three or four active swap devices since the system does not
  396          * typically have to pageout at extreme bandwidths.   We will want
  397          * at least 2 per swap devices, and 4 is a pretty good value if you
  398          * have one NFS swap device due to the command/ack latency over NFS.
  399          * So it all works out pretty well.
  400          */
  401         nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
  402 
  403         mtx_lock(&pbuf_mtx);
  404         nsw_rcount = (nswbuf + 1) / 2;          /* half of nswbuf, rounded up */
  405         nsw_wcount_sync = (nswbuf + 3) / 4;     /* a quarter of nswbuf, rounded up */
  406         nsw_wcount_async = 4;
  407         nsw_wcount_async_max = nsw_wcount_async;
  408         mtx_unlock(&pbuf_mtx);
  409 
  410         /*
  411          * Initialize our zone.  The entry count is a guess: half the page
  412          * count of the system, capped at maxswzone bytes when that is set.
  413          * Each swblock covers SWAP_META_PAGES pages, so this is probably
  414          * overkill.  (The cap is said to be around 32MB by default.)
  415          */
  416         n = cnt.v_page_count / 2;
  417         if (maxswzone && n > maxswzone / sizeof(struct swblock))
  418                 n = maxswzone / sizeof(struct swblock);
  419         n2 = n;
  420         swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL,
  421             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
  422         if (swap_zone == NULL)
  423                 panic("failed to create swap_zone.");
  424         do {
  425                 if (uma_zone_set_obj(swap_zone, &swap_zone_obj, n))
  426                         break;
  427                 /*
  428                  * if the allocation failed, try a zone two thirds the
  429                  * size of the previous attempt; n == 0 ends the retries.
  430                  */
  431                 n -= ((n + 2) / 3);
  432         } while (n > 0);
  433         if (n2 != n)
  434                 printf("Swap zone entries reduced from %d to %d.\n", n2, n);
  435         n2 = n;
  436 
  437         /*
  438          * Initialize our meta-data hash table.  The swapper does not need to
  439          * be quite as efficient as the VM system, so we do not use an 
  440          * oversized hash table: the smallest power of 2 >= n2 / 8 buckets.
  441          *
  442          *      n:              size of hash table, must be power of 2
  443          *      swhash_mask:    hash table index mask
  444          */
  445         for (n = 1; n < n2 / 8; n *= 2)
  446                 ;
  447         swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO);
  448         swhash_mask = n - 1;
  449         mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF);
  450 }
  451 
  452 /*
  453  * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
  454  *                      its metadata structures.
  455  *
  456  *      This routine is called from the mmap and fork code to create a new
  457  *      OBJT_SWAP object.  We do this by creating an OBJT_DEFAULT object
  458  *      and then converting it with swp_pager_meta_build().
  459  *
  460  *      A non-NULL handle names the object: an existing object found on
  461  *      the handle's list gains a reference and is returned, otherwise a
  462  *      new one is created.  Lookup and creation of named objects run
  463  *      with Giant and sw_alloc_sx held so that the lookup cannot race
  464  *      with another creation.  Anonymous (NULL handle) requests always
  465  *      create a fresh object and take neither of those locks.
  466  *      The prot argument is not used.
  467  *
  468  * MPSAFE
  469  */
  470 static vm_object_t
  471 swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
  472                  vm_ooffset_t offset)
  473 {
  474         vm_object_t object;
  475         vm_pindex_t pindex;
  476         /* Object size in pages; adding PAGE_MASK presumably rounds up. */
  477         pindex = OFF_TO_IDX(offset + PAGE_MASK + size);
  478 
  479         if (handle) {
  480                 mtx_lock(&Giant);
  481                 /*
  482                  * Reference existing named region or allocate new one.  There
  483                  * should not be a race here against swp_pager_meta_build()
  484                  * as called from vm_page_remove() in regards to the lookup
  485                  * of the handle.
  486                  */
  487                 sx_xlock(&sw_alloc_sx);
  488                 object = vm_pager_object_lookup(NOBJLIST(handle), handle);
  489 
  490                 if (object != NULL) {
  491                         vm_object_reference(object);
  492                 } else {
  493                         object = vm_object_allocate(OBJT_DEFAULT, pindex);
  494                         object->handle = handle;
  495 
  496                         VM_OBJECT_LOCK(object);
  497                         swp_pager_meta_build(object, 0, SWAPBLK_NONE);
  498                         VM_OBJECT_UNLOCK(object);
  499                 }
  500                 sx_xunlock(&sw_alloc_sx);
  501                 mtx_unlock(&Giant);
  502         } else {
  503                 object = vm_object_allocate(OBJT_DEFAULT, pindex);
  504 
  505                 VM_OBJECT_LOCK(object);
  506                 swp_pager_meta_build(object, 0, SWAPBLK_NONE);
  507                 VM_OBJECT_UNLOCK(object);
  508         }
  509         return (object);
  510 }
  511 
  512 /*
  513  * SWAP_PAGER_DEALLOC() -       remove swap metadata from object
  514  *
  515  *      The swap backing for the object is destroyed.  The code is 
  516  *      designed such that we can reinstantiate it later, but this
  517  *      routine is typically called only when the entire object is
  518  *      about to be destroyed.
  519  *
  520  *      NOTE(review): vm_object_pip_wait() presumably can sleep - confirm.
  521  *
  522  *      The object lock must be held by the caller (asserted below).
  523  */
  524 static void
  525 swap_pager_dealloc(vm_object_t object)
  526 {
  527 
  528         /*
  529          * Remove from list right away so lookups will fail if we block for
  530          * pageout completion.  Only named objects (handle != NULL) are listed.
  531          */
  532         if (object->handle != NULL) {
  533                 mtx_lock(&sw_alloc_mtx);
  534                 TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list);
  535                 mtx_unlock(&sw_alloc_mtx);
  536         }
  537 
  538         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
  539         vm_object_pip_wait(object, "swpdea");
  540 
  541         /*
  542          * Free all remaining metadata.  We only bother to free it from 
  543          * the swap meta data.  We do not attempt to free swapblk's still
  544          * associated with vm_page_t's for this object.  We do not care
  545          * if paging is still in progress on some objects.
  546          */
  547         swp_pager_meta_free_all(object);
  548 }
  549 
  550 /************************************************************************
  551  *                      SWAP PAGER BITMAP ROUTINES                      *
  552  ************************************************************************/
  553 
  554 /*
  555  * SWP_PAGER_GETSWAPSPACE() -   allocate raw swap space
  556  *
  557  *      Allocate swap for the requested number of pages.  The starting
  558  *      swap block number (a page index) is returned or SWAPBLK_NONE
  559  *      if the allocation failed.
  560  *
  561  *      On failure swap_pager_full is set to 2 and swap_pager_almost_full
  562  *      to 1; the failure is logged only when swap_pager_full was not 2.
  563  *
  564  *      The device table, the per-device bitmaps and the counters touched
  565  *      here are serialized by sw_dev_mtx, which this routine acquires.
  566  *
  567  *      Devices flagged SW_CLOSING are skipped.
  568  *      swdevhd remembers where the next search starts.
  569  *
  570  *      We allocate in round-robin fashion from the configured devices.
  571  */
  572 static daddr_t
  573 swp_pager_getswapspace(int npages)
  574 {
  575         daddr_t blk;
  576         struct swdevt *sp;
  577         int i;
  578 
  579         blk = SWAPBLK_NONE;
  580         mtx_lock(&sw_dev_mtx);
  581         sp = swdevhd;
  582         for (i = 0; i < nswapdev; i++) {
  583                 if (sp == NULL)         /* ran off the end: wrap to the first device */
  584                         sp = TAILQ_FIRST(&swtailq);
  585                 if (!(sp->sw_flags & SW_CLOSING)) {
  586                         blk = blist_alloc(sp->sw_blist, npages);
  587                         if (blk != SWAPBLK_NONE) {
  588                                 blk += sp->sw_first;    /* device-relative -> global block number */
  589                                 sp->sw_used += npages;
  590                                 swap_pager_avail -= npages;
  591                                 swp_sizecheck();
  592                                 swdevhd = TAILQ_NEXT(sp, sw_list);      /* next search starts after sp */
  593                                 goto done;
  594                         }
  595                 }
  596                 sp = TAILQ_NEXT(sp, sw_list);
  597         }
  598         if (swap_pager_full != 2) {
  599                 printf("swap_pager_getswapspace(%d): failed\n", npages);
  600                 swap_pager_full = 2;
  601                 swap_pager_almost_full = 1;
  602         }
  603         swdevhd = NULL;         /* next search restarts at the list head */
  604 done:
  605         mtx_unlock(&sw_dev_mtx);
  606         return (blk);
  607 }
  608 
  609 static int
  610 swp_pager_isondev(daddr_t blk, struct swdevt *sp)
  611 {
  612         /* Nonzero iff sw_first <= blk < sw_end, i.e. blk lies on device sp. */
  613         return (!(blk < sp->sw_first || blk >= sp->sw_end));
  614 }
  615 /* SWP_PAGER_STRATEGY() - hand bp to the swap device owning bp->b_blkno; panics if none. */
  616 static void
  617 swp_pager_strategy(struct buf *bp)
  618 {
  619         struct swdevt *sp;
  620 
  621         mtx_lock(&sw_dev_mtx);
  622         TAILQ_FOREACH(sp, &swtailq, sw_list) {
  623                 if (swp_pager_isondev(bp->b_blkno, sp)) {
  624                         mtx_unlock(&sw_dev_mtx);        /* not held across the device call */
  625                         sp->sw_strategy(bp, sp);
  626                         return;
  627                 }
  628         }
  629         panic("Swapdev not found");
  630 }
  631         
  632 
  633 /*
  634  * SWP_PAGER_FREESWAPSPACE() -  free raw swap space
  635  *
  636  *      Returns npages swap blocks starting at blk to the bitmap of the
  637  *      device whose range [sw_first, sw_end) contains blk, and lowers that
  638  *      device's sw_used count.
  639  *
  640  *      If the device is marked SW_CLOSING only sw_used is adjusted; the
  641  *      blocks are not handed back, so they cannot be allocated again.
  642  *
  643  *      Panics if blk does not belong to any configured device.
  644  *
  645  *      Serialized by sw_dev_mtx, which this routine acquires itself.
  646  */
  647 static void
  648 swp_pager_freeswapspace(daddr_t blk, int npages)
  649 {
  650         struct swdevt *sp;
  651 
  652         mtx_lock(&sw_dev_mtx);
  653         TAILQ_FOREACH(sp, &swtailq, sw_list) {
  654                 if (swp_pager_isondev(blk, sp)) {
  655                         sp->sw_used -= npages;
  656                         /*
  657                          * If we are attempting to stop swapping on
  658                          * this device, we don't want to mark any
  659                          * blocks free lest they be reused.  
  660                          */
  661                         if ((sp->sw_flags & SW_CLOSING) == 0) {
  662                                 blist_free(sp->sw_blist, blk - sp->sw_first,
  663                                     npages);
  664                                 swap_pager_avail += npages;
  665                                 swp_sizecheck();
  666                         }
  667                         mtx_unlock(&sw_dev_mtx);
  668                         return;
  669                 }
  670         }
  671         panic("Swapdev not found");
  672 }
  673 
  674 /*
  675  * SWAP_PAGER_FREESPACE() -     frees swap blocks associated with a page
  676  *                              range within an object.
  677  *
  678  *      This is a globally accessible routine.
  679  *
  680  *      This routine removes swapblk assignments from swap metadata.
  681  *
  682  *      The external callers of this routine typically have already destroyed 
  683  *      or renamed vm_page_t's associated with this range in the object so 
  684  *      we should be ok.
  685  *
  686  *      The caller must hold the object lock (asserted); the work itself is
  687  *      delegated to swp_pager_meta_free() for 'size' pages from 'start'.
  688  */
  689 void
  690 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size)
  691 {
  692 
  693         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
  694         swp_pager_meta_free(object, start, size);
  695 }
  696 
  697 /*
  698  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
  699  *
  700  *      Assigns a swap block to each of the 'size' pages starting at 'start'.
  701  *      The swap blocks are not zerod.  Any previous swap assignment is
  702  *      destroyed.  Takes the object lock itself.  On failure the range
  703  *      processed so far is released.  Returns 0 on success, -1 on failure.
  704  */
  705 int
  706 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
  707 {
  708         int n = 0;                      /* unassigned blocks left in the current chunk */
  709         daddr_t blk = SWAPBLK_NONE;     /* next unassigned block of the current chunk */
  710         vm_pindex_t beg = start;        /* save start index */
  711 
  712         VM_OBJECT_LOCK(object);
  713         for (; size > 0; --size) {
  714                 if (n == 0) {
  715                         n = BLIST_MAX_ALLOC;    /* try the largest chunk, halve on failure */
  716                         while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) {
  717                                 n >>= 1;
  718                                 if (n == 0) {
  719                                         swp_pager_meta_free(object, beg, start - beg);
  720                                         VM_OBJECT_UNLOCK(object);
  721                                         return (-1);
  722                                 }
  723                         }
  724                 }
  725                 swp_pager_meta_build(object, start, blk);
  726                 ++start;
  727                 ++blk;
  728                 --n;
  729         }
  730         if (n > 0)      /* give the unused tail of the last chunk back to the bitmap */
  731                 swp_pager_freeswapspace(blk, n);
  732         VM_OBJECT_UNLOCK(object);
  733         return (0);
  734 }
  735 
  736 /*
  737  * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
  738  *                      and destroy the source.
  739  *
  740  *      Copy any valid swapblks from the source to the destination.  In
  741  *      cases where both the source and destination have a valid swapblk,
  742  *      we keep the destination's.
  743  *
  744  *      This routine is allowed to block.  It may block allocating metadata
  745  *      indirectly through swp_pager_meta_build() or if paging is still in
  746  *      progress on the source. 
  747  *
  748  *      This routine can be called at any spl
  749  *
  750  *      XXX vm_page_collapse() kinda expects us not to block because we 
  751  *      supposedly do not need to allocate memory, but for the moment we
  752  *      *may* have to get a little memory from the zone allocator, but
  753  *      it is taken from the interrupt memory.  We should be ok. 
  754  *
  755  *      The source object contains no vm_page_t's (which is just as well)
  756  *
  757  *      The source object is of type OBJT_SWAP.
  758  *
  759  *      The source and destination objects must be locked or 
  760  *      inaccessible (XXX are they ?)
  761  */
  762 void
  763 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
  764     vm_pindex_t offset, int destroysource)
  765 {
  766         vm_pindex_t i;
  767 
  768         VM_OBJECT_LOCK_ASSERT(srcobject, MA_OWNED);
  769         VM_OBJECT_LOCK_ASSERT(dstobject, MA_OWNED);
  770 
  771         /*
  772          * If destroysource is set, we remove the source object from the 
  773          * swap_pager internal queue now. 
  774          */
  775         if (destroysource) {
  776                 if (srcobject->handle != NULL) {
  777                         mtx_lock(&sw_alloc_mtx);
  778                         TAILQ_REMOVE(
  779                             NOBJLIST(srcobject->handle),
  780                             srcobject,
  781                             pager_object_list
  782                         );
  783                         mtx_unlock(&sw_alloc_mtx);
  784                 }
  785         }
  786 
  787         /*
  788          * transfer source to destination.
  789          */
  790         for (i = 0; i < dstobject->size; ++i) {
  791                 daddr_t dstaddr;
  792 
  793                 /*
  794                  * Locate (without changing) the swapblk on the destination,
  795                  * unless it is invalid in which case free it silently, or
  796                  * if the destination is a resident page, in which case the
  797                  * source is thrown away.
  798                  */
  799                 dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
  800 
  801                 if (dstaddr == SWAPBLK_NONE) {
  802                         /*
  803                          * Destination has no swapblk and is not resident,
  804                          * copy source.
  805                          */
  806                         daddr_t srcaddr;
  807 
  808                         srcaddr = swp_pager_meta_ctl(
  809                             srcobject, 
  810                             i + offset,
  811                             SWM_POP
  812                         );
  813 
  814                         if (srcaddr != SWAPBLK_NONE) {
  815                                 /*
  816                                  * swp_pager_meta_build() can sleep.
  817                                  */
  818                                 vm_object_pip_add(srcobject, 1);
  819                                 VM_OBJECT_UNLOCK(srcobject);
  820                                 vm_object_pip_add(dstobject, 1);
  821                                 swp_pager_meta_build(dstobject, i, srcaddr);
  822                                 vm_object_pip_wakeup(dstobject);
  823                                 VM_OBJECT_LOCK(srcobject);
  824                                 vm_object_pip_wakeup(srcobject);
  825                         }
  826                 } else {
  827                         /*
  828                          * Destination has valid swapblk or it is represented
  829                          * by a resident page.  We destroy the sourceblock.
  830                          */
  831                         
  832                         swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
  833                 }
  834         }
  835 
  836         /*
  837          * Free left over swap blocks in source.
  838          *
  839          * We have to revert the type to OBJT_DEFAULT so we do not accidently
  840          * double-remove the object from the swap queues.
  841          */
  842         if (destroysource) {
  843                 swp_pager_meta_free_all(srcobject);
  844                 /*
  845                  * Reverting the type is not necessary, the caller is going
  846                  * to destroy srcobject directly, but I'm doing it here
  847                  * for consistency since we've removed the object from its
  848                  * queues.
  849                  */
  850                 srcobject->type = OBJT_DEFAULT;
  851         }
  852 }
  853 
  854 /*
  855  * SWAP_PAGER_HASPAGE() -       determine if we have good backing store for
  856  *                              the requested page.
  857  *
  858  *      We determine whether good backing store exists for the requested
  859  *      page and return TRUE if it does, FALSE if it doesn't.
  860  *
  861  *      If TRUE, we also try to determine how much valid, contiguous backing
  862  *      store exists before and after the requested page within a reasonable
  863  *      distance.  We do not try to restrict it to the swap device stripe
  864  *      (that is handled in getpages/putpages).  It probably isn't worth
  865  *      doing here.
  866  */
  867 static boolean_t
  868 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after)
  869 {
  870         daddr_t blk0;
  871 
  872         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
  873         /*
  874          * do we have good backing store at the requested index ?
  875          */
  876         blk0 = swp_pager_meta_ctl(object, pindex, 0);
  877 
  878         if (blk0 == SWAPBLK_NONE) {
  879                 if (before)
  880                         *before = 0;
  881                 if (after)
  882                         *after = 0;
  883                 return (FALSE);
  884         }
  885 
  886         /*
  887          * find backwards-looking contiguous good backing store
  888          */
  889         if (before != NULL) {
  890                 int i;
  891 
  892                 for (i = 1; i < (SWB_NPAGES/2); ++i) {
  893                         daddr_t blk;
  894 
  895                         if (i > pindex)
  896                                 break;
  897                         blk = swp_pager_meta_ctl(object, pindex - i, 0);
  898                         if (blk != blk0 - i)
  899                                 break;
  900                 }
  901                 *before = (i - 1);
  902         }
  903 
  904         /*
  905          * find forward-looking contiguous good backing store
  906          */
  907         if (after != NULL) {
  908                 int i;
  909 
  910                 for (i = 1; i < (SWB_NPAGES/2); ++i) {
  911                         daddr_t blk;
  912 
  913                         blk = swp_pager_meta_ctl(object, pindex + i, 0);
  914                         if (blk != blk0 + i)
  915                                 break;
  916                 }
  917                 *after = (i - 1);
  918         }
  919         return (TRUE);
  920 }
  921 
  922 /*
  923  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
  924  *
  925  *      This removes any associated swap backing store, whether valid or
  926  *      not, from the page.  
  927  *
  928  *      This routine is typically called when a page is made dirty, at
  929  *      which point any associated swap can be freed.  MADV_FREE also
  930  *      calls us in a special-case situation
  931  *
  932  *      NOTE!!!  If the page is clean and the swap was valid, the caller
  933  *      should make the page dirty before calling this routine.  This routine
  934  *      does NOT change the m->dirty status of the page.  Also: MADV_FREE
  935  *      depends on it.
  936  *
  937  *      This routine may not block
  938  *      This routine must be called at splvm()
  939  */
  940 static void
  941 swap_pager_unswapped(vm_page_t m)
  942 {
  943 
  944         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
  945         swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
  946 }
  947 
  948 /*
  949  * SWAP_PAGER_GETPAGES() - bring pages in from swap
  950  *
  951  *      Attempt to retrieve (m, count) pages from backing store, but make
  952  *      sure we retrieve at least m[reqpage].  We try to load in as large
  953  *      a chunk surrounding m[reqpage] as is contiguous in swap and which
  954  *      belongs to the same object.
  955  *
  956  *      The code is designed for asynchronous operation and 
  957  *      immediate-notification of 'reqpage' but tends not to be
  958  *      used that way.  Please do not optimize-out this algorithmic
  959  *      feature, I intend to improve on it in the future.
  960  *
  961  *      The parent has a single vm_object_pip_add() reference prior to
  962  *      calling us and we should return with the same.
  963  *
  964  *      The parent has BUSY'd the pages.  We should return with 'm'
  965  *      left busy, but the others adjusted.
  966  */
  967 static int
  968 swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
  969 {
  970         struct buf *bp;
  971         vm_page_t mreq;
  972         int i;
  973         int j;
  974         daddr_t blk;
  975 
  976         mreq = m[reqpage];
  977 
  978         KASSERT(mreq->object == object,
  979             ("swap_pager_getpages: object mismatch %p/%p",
  980             object, mreq->object));
  981 
  982         /*
  983          * Calculate range to retrieve.  The pages have already been assigned
  984          * their swapblks.  We require a *contiguous* range but we know it to
  985          * not span devices.   If we do not supply it, bad things
  986          * happen.  Note that blk, iblk & jblk can be SWAPBLK_NONE, but the 
  987          * loops are set up such that the case(s) are handled implicitly.
  988          *
  989          * The swp_*() calls must be made at splvm().  vm_page_free() does
  990          * not need to be, but it will go a little faster if it is.
  991          */
  992         blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
  993 
  994         for (i = reqpage - 1; i >= 0; --i) {
  995                 daddr_t iblk;
  996 
  997                 iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
  998                 if (blk != iblk + (reqpage - i))
  999                         break;
 1000         }
 1001         ++i;
 1002 
 1003         for (j = reqpage + 1; j < count; ++j) {
 1004                 daddr_t jblk;
 1005 
 1006                 jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
 1007                 if (blk != jblk - (j - reqpage))
 1008                         break;
 1009         }
 1010 
 1011         /*
 1012          * free pages outside our collection range.   Note: we never free
 1013          * mreq, it must remain busy throughout.
 1014          */
 1015         if (0 < i || j < count) {
 1016                 int k;
 1017 
 1018                 vm_page_lock_queues();
 1019                 for (k = 0; k < i; ++k)
 1020                         vm_page_free(m[k]);
 1021                 for (k = j; k < count; ++k)
 1022                         vm_page_free(m[k]);
 1023                 vm_page_unlock_queues();
 1024         }
 1025 
 1026         /*
 1027          * Return VM_PAGER_FAIL if we have nothing to do.  Return mreq 
 1028          * still busy, but the others unbusied.
 1029          */
 1030         if (blk == SWAPBLK_NONE)
 1031                 return (VM_PAGER_FAIL);
 1032 
 1033         /*
 1034          * Getpbuf() can sleep.
 1035          */
 1036         VM_OBJECT_UNLOCK(object);
 1037         /*
 1038          * Get a swap buffer header to perform the IO
 1039          */
 1040         bp = getpbuf(&nsw_rcount);
 1041         bp->b_flags |= B_PAGING;
 1042 
 1043         /*
 1044          * map our page(s) into kva for input
 1045          */
 1046         pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i);
 1047 
 1048         bp->b_iocmd = BIO_READ;
 1049         bp->b_iodone = swp_pager_async_iodone;
 1050         bp->b_rcred = crhold(thread0.td_ucred);
 1051         bp->b_wcred = crhold(thread0.td_ucred);
 1052         bp->b_blkno = blk - (reqpage - i);
 1053         bp->b_bcount = PAGE_SIZE * (j - i);
 1054         bp->b_bufsize = PAGE_SIZE * (j - i);
 1055         bp->b_pager.pg_reqpage = reqpage - i;
 1056 
 1057         VM_OBJECT_LOCK(object);
 1058         vm_page_lock_queues();
 1059         {
 1060                 int k;
 1061 
 1062                 for (k = i; k < j; ++k) {
 1063                         bp->b_pages[k - i] = m[k];
 1064                         vm_page_flag_set(m[k], PG_SWAPINPROG);
 1065                 }
 1066         }
 1067         vm_page_unlock_queues();
 1068         bp->b_npages = j - i;
 1069 
 1070         cnt.v_swapin++;
 1071         cnt.v_swappgsin += bp->b_npages;
 1072 
 1073         /*
 1074          * We still hold the lock on mreq, and our automatic completion routine
 1075          * does not remove it.
 1076          */
 1077         vm_object_pip_add(object, bp->b_npages);
 1078         VM_OBJECT_UNLOCK(object);
 1079 
 1080         /*
 1081          * perform the I/O.  NOTE!!!  bp cannot be considered valid after
 1082          * this point because we automatically release it on completion.
 1083          * Instead, we look at the one page we are interested in which we
 1084          * still hold a lock on even through the I/O completion.
 1085          *
 1086          * The other pages in our m[] array are also released on completion,
 1087          * so we cannot assume they are valid anymore either.
 1088          *
 1089          * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 1090          */
 1091         BUF_KERNPROC(bp);
 1092         swp_pager_strategy(bp);
 1093 
 1094         /*
 1095          * wait for the page we want to complete.  PG_SWAPINPROG is always
 1096          * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
 1097          * is set in the meta-data.
 1098          */
 1099         vm_page_lock_queues();
 1100         while ((mreq->flags & PG_SWAPINPROG) != 0) {
 1101                 vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED);
 1102                 cnt.v_intrans++;
 1103                 if (msleep(mreq, &vm_page_queue_mtx, PSWP, "swread", hz*20)) {
 1104                         printf(
 1105 "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
 1106                             bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
 1107                 }
 1108         }
 1109         vm_page_unlock_queues();
 1110 
 1111         VM_OBJECT_LOCK(object);
 1112         /*
 1113          * mreq is left busied after completion, but all the other pages
 1114          * are freed.  If we had an unrecoverable read error the page will
 1115          * not be valid.
 1116          */
 1117         if (mreq->valid != VM_PAGE_BITS_ALL) {
 1118                 return (VM_PAGER_ERROR);
 1119         } else {
 1120                 return (VM_PAGER_OK);
 1121         }
 1122 
 1123         /*
 1124          * A final note: in a low swap situation, we cannot deallocate swap
 1125          * and mark a page dirty here because the caller is likely to mark
 1126          * the page clean when we return, causing the page to possibly revert 
 1127          * to all-zero's later.
 1128          */
 1129 }
 1130 
 1131 /*
 1132  *      swap_pager_putpages: 
 1133  *
 1134  *      Assign swap (if necessary) and initiate I/O on the specified pages.
 1135  *
 1136  *      We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
 1137  *      are automatically converted to SWAP objects.
 1138  *
 1139  *      In a low memory situation we may block in VOP_STRATEGY(), but the new 
 1140  *      vm_page reservation system coupled with properly written VFS devices 
 1141  *      should ensure that no low-memory deadlock occurs.  This is an area
 1142  *      which needs work.
 1143  *
 1144  *      The parent has N vm_object_pip_add() references prior to
 1145  *      calling us and will remove references for rtvals[] that are
 1146  *      not set to VM_PAGER_PEND.  We need to remove the rest on I/O
 1147  *      completion.
 1148  *
 1149  *      The parent has soft-busy'd the pages it passes us and will unbusy
 1150  *      those whos rtvals[] entry is not set to VM_PAGER_PEND on return.
 1151  *      We need to unbusy the rest on I/O completion.
 1152  */
 1153 void
 1154 swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
 1155     boolean_t sync, int *rtvals)
 1156 {
 1157         int i;
 1158         int n = 0;
 1159 
 1160         GIANT_REQUIRED;
 1161         if (count && m[0]->object != object) {
 1162                 panic("swap_pager_getpages: object mismatch %p/%p", 
 1163                     object, 
 1164                     m[0]->object
 1165                 );
 1166         }
 1167 
 1168         /*
 1169          * Step 1
 1170          *
 1171          * Turn object into OBJT_SWAP
 1172          * check for bogus sysops
 1173          * force sync if not pageout process
 1174          */
 1175         if (object->type != OBJT_SWAP)
 1176                 swp_pager_meta_build(object, 0, SWAPBLK_NONE);
 1177         VM_OBJECT_UNLOCK(object);
 1178 
 1179         if (curproc != pageproc)
 1180                 sync = TRUE;
 1181 
 1182         /*
 1183          * Step 2
 1184          *
 1185          * Update nsw parameters from swap_async_max sysctl values.  
 1186          * Do not let the sysop crash the machine with bogus numbers.
 1187          */
 1188         mtx_lock(&pbuf_mtx);
 1189         if (swap_async_max != nsw_wcount_async_max) {
 1190                 int n;
 1191 
 1192                 /*
 1193                  * limit range
 1194                  */
 1195                 if ((n = swap_async_max) > nswbuf / 2)
 1196                         n = nswbuf / 2;
 1197                 if (n < 1)
 1198                         n = 1;
 1199                 swap_async_max = n;
 1200 
 1201                 /*
 1202                  * Adjust difference ( if possible ).  If the current async
 1203                  * count is too low, we may not be able to make the adjustment
 1204                  * at this time.
 1205                  */
 1206                 n -= nsw_wcount_async_max;
 1207                 if (nsw_wcount_async + n >= 0) {
 1208                         nsw_wcount_async += n;
 1209                         nsw_wcount_async_max += n;
 1210                         wakeup(&nsw_wcount_async);
 1211                 }
 1212         }
 1213         mtx_unlock(&pbuf_mtx);
 1214 
 1215         /*
 1216          * Step 3
 1217          *
 1218          * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
 1219          * The page is left dirty until the pageout operation completes
 1220          * successfully.
 1221          */
 1222         for (i = 0; i < count; i += n) {
 1223                 int j;
 1224                 struct buf *bp;
 1225                 daddr_t blk;
 1226 
 1227                 /*
 1228                  * Maximum I/O size is limited by a number of factors.
 1229                  */
 1230                 n = min(BLIST_MAX_ALLOC, count - i);
 1231                 n = min(n, nsw_cluster_max);
 1232 
 1233                 /*
 1234                  * Get biggest block of swap we can.  If we fail, fall
 1235                  * back and try to allocate a smaller block.  Don't go
 1236                  * overboard trying to allocate space if it would overly
 1237                  * fragment swap.
 1238                  */
 1239                 while (
 1240                     (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE &&
 1241                     n > 4
 1242                 ) {
 1243                         n >>= 1;
 1244                 }
 1245                 if (blk == SWAPBLK_NONE) {
 1246                         for (j = 0; j < n; ++j)
 1247                                 rtvals[i+j] = VM_PAGER_FAIL;
 1248                         continue;
 1249                 }
 1250 
 1251                 /*
 1252                  * All I/O parameters have been satisfied, build the I/O
 1253                  * request and assign the swap space.
 1254                  */
 1255                 if (sync == TRUE) {
 1256                         bp = getpbuf(&nsw_wcount_sync);
 1257                 } else {
 1258                         bp = getpbuf(&nsw_wcount_async);
 1259                         bp->b_flags = B_ASYNC;
 1260                 }
 1261                 bp->b_flags |= B_PAGING;
 1262                 bp->b_iocmd = BIO_WRITE;
 1263 
 1264                 pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
 1265 
 1266                 bp->b_rcred = crhold(thread0.td_ucred);
 1267                 bp->b_wcred = crhold(thread0.td_ucred);
 1268                 bp->b_bcount = PAGE_SIZE * n;
 1269                 bp->b_bufsize = PAGE_SIZE * n;
 1270                 bp->b_blkno = blk;
 1271 
 1272                 VM_OBJECT_LOCK(object);
 1273                 for (j = 0; j < n; ++j) {
 1274                         vm_page_t mreq = m[i+j];
 1275 
 1276                         swp_pager_meta_build(
 1277                             mreq->object, 
 1278                             mreq->pindex,
 1279                             blk + j
 1280                         );
 1281                         vm_page_dirty(mreq);
 1282                         rtvals[i+j] = VM_PAGER_OK;
 1283 
 1284                         vm_page_lock_queues();
 1285                         vm_page_flag_set(mreq, PG_SWAPINPROG);
 1286                         vm_page_unlock_queues();
 1287                         bp->b_pages[j] = mreq;
 1288                 }
 1289                 VM_OBJECT_UNLOCK(object);
 1290                 bp->b_npages = n;
 1291                 /*
 1292                  * Must set dirty range for NFS to work.
 1293                  */
 1294                 bp->b_dirtyoff = 0;
 1295                 bp->b_dirtyend = bp->b_bcount;
 1296 
 1297                 cnt.v_swapout++;
 1298                 cnt.v_swappgsout += bp->b_npages;
 1299 
 1300                 /*
 1301                  * asynchronous
 1302                  *
 1303                  * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 1304                  */
 1305                 if (sync == FALSE) {
 1306                         bp->b_iodone = swp_pager_async_iodone;
 1307                         BUF_KERNPROC(bp);
 1308                         swp_pager_strategy(bp);
 1309 
 1310                         for (j = 0; j < n; ++j)
 1311                                 rtvals[i+j] = VM_PAGER_PEND;
 1312                         /* restart outter loop */
 1313                         continue;
 1314                 }
 1315 
 1316                 /*
 1317                  * synchronous
 1318                  *
 1319                  * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 1320                  */
 1321                 bp->b_iodone = bdone;
 1322                 swp_pager_strategy(bp);
 1323 
 1324                 /*
 1325                  * Wait for the sync I/O to complete, then update rtvals.
 1326                  * We just set the rtvals[] to VM_PAGER_PEND so we can call
 1327                  * our async completion routine at the end, thus avoiding a
 1328                  * double-free.
 1329                  */
 1330                 bwait(bp, PVM, "swwrt");
 1331                 for (j = 0; j < n; ++j)
 1332                         rtvals[i+j] = VM_PAGER_PEND;
 1333                 /*
 1334                  * Now that we are through with the bp, we can call the
 1335                  * normal async completion, which frees everything up.
 1336                  */
 1337                 swp_pager_async_iodone(bp);
 1338         }
 1339         VM_OBJECT_LOCK(object);
 1340 }
 1341 
 1342 /*
 1343  *      swp_pager_async_iodone:
 1344  *
 1345  *      Completion routine for asynchronous reads and writes from/to swap.
 1346  *      Also called manually by synchronous code to finish up a bp.
 1347  *
 *      For READ operations the pages are PG_BUSY'd, and on completion we
 *      unbusy all of them except the 'main' request page.  For WRITE
 *      operations the pages are busied through vm_page_t->busy, and on
 *      completion we unbusy all of them ( we can do this because we
 *      marked them all VM_PAGER_PEND on return from putpages ).
 1353  *
 1354  *      This routine may not block.
 1355  *      This routine is called at splbio() or better
 1356  *
 1357  *      We up ourselves to splvm() as required for various vm_page related
 1358  *      calls.
 1359  */
 1360 static void
 1361 swp_pager_async_iodone(struct buf *bp)
 1362 {
 1363         int i;
 1364         vm_object_t object = NULL;
 1365 
 1366         /*
 1367          * report error
 1368          */
 1369         if (bp->b_ioflags & BIO_ERROR) {
 1370                 printf(
 1371                     "swap_pager: I/O error - %s failed; blkno %ld,"
 1372                         "size %ld, error %d\n",
 1373                     ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
 1374                     (long)bp->b_blkno, 
 1375                     (long)bp->b_bcount,
 1376                     bp->b_error
 1377                 );
 1378         }
 1379 
 1380         /*
 1381          * remove the mapping for kernel virtual
 1382          */
 1383         pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
 1384 
 1385         if (bp->b_npages) {
 1386                 object = bp->b_pages[0]->object;
 1387                 VM_OBJECT_LOCK(object);
 1388         }
 1389         vm_page_lock_queues();
 1390         /*
 1391          * cleanup pages.  If an error occurs writing to swap, we are in
 1392          * very serious trouble.  If it happens to be a disk error, though,
 1393          * we may be able to recover by reassigning the swap later on.  So
 1394          * in this case we remove the m->swapblk assignment for the page 
 1395          * but do not free it in the rlist.  The errornous block(s) are thus
 1396          * never reallocated as swap.  Redirty the page and continue.
 1397          */
 1398         for (i = 0; i < bp->b_npages; ++i) {
 1399                 vm_page_t m = bp->b_pages[i];
 1400 
 1401                 vm_page_flag_clear(m, PG_SWAPINPROG);
 1402 
 1403                 if (bp->b_ioflags & BIO_ERROR) {
 1404                         /*
 1405                          * If an error occurs I'd love to throw the swapblk
 1406                          * away without freeing it back to swapspace, so it
 1407                          * can never be used again.  But I can't from an 
 1408                          * interrupt.
 1409                          */
 1410                         if (bp->b_iocmd == BIO_READ) {
 1411                                 /*
 1412                                  * When reading, reqpage needs to stay
 1413                                  * locked for the parent, but all other
 1414                                  * pages can be freed.  We still want to
 1415                                  * wakeup the parent waiting on the page,
 1416                                  * though.  ( also: pg_reqpage can be -1 and 
 1417                                  * not match anything ).
 1418                                  *
 1419                                  * We have to wake specifically requested pages
 1420                                  * up too because we cleared PG_SWAPINPROG and
 1421                                  * someone may be waiting for that.
 1422                                  *
 1423                                  * NOTE: for reads, m->dirty will probably
 1424                                  * be overridden by the original caller of
 1425                                  * getpages so don't play cute tricks here.
 1426                                  *
 1427                                  * XXX IT IS NOT LEGAL TO FREE THE PAGE HERE
 1428                                  * AS THIS MESSES WITH object->memq, and it is
 1429                                  * not legal to mess with object->memq from an
 1430                                  * interrupt.
 1431                                  */
 1432                                 m->valid = 0;
 1433                                 if (i != bp->b_pager.pg_reqpage)
 1434                                         vm_page_free(m);
 1435                                 else
 1436                                         vm_page_flash(m);
 1437                                 /*
 1438                                  * If i == bp->b_pager.pg_reqpage, do not wake 
 1439                                  * the page up.  The caller needs to.
 1440                                  */
 1441                         } else {
 1442                                 /*
 1443                                  * If a write error occurs, reactivate page
 1444                                  * so it doesn't clog the inactive list,
 1445                                  * then finish the I/O.
 1446                                  */
 1447                                 vm_page_dirty(m);
 1448                                 vm_page_activate(m);
 1449                                 vm_page_io_finish(m);
 1450                         }
 1451                 } else if (bp->b_iocmd == BIO_READ) {
 1452                         /*
 1453                          * For read success, clear dirty bits.  Nobody should
 1454                          * have this page mapped but don't take any chances,
 1455                          * make sure the pmap modify bits are also cleared.
 1456                          *
 1457                          * NOTE: for reads, m->dirty will probably be 
 1458                          * overridden by the original caller of getpages so
 1459                          * we cannot set them in order to free the underlying
 1460                          * swap in a low-swap situation.  I don't think we'd
 1461                          * want to do that anyway, but it was an optimization
 1462                          * that existed in the old swapper for a time before
 1463                          * it got ripped out due to precisely this problem.
 1464                          *
 1465                          * If not the requested page then deactivate it.
 1466                          *
 1467                          * Note that the requested page, reqpage, is left
 1468                          * busied, but we still have to wake it up.  The
 1469                          * other pages are released (unbusied) by 
 1470                          * vm_page_wakeup().  We do not set reqpage's
 1471                          * valid bits here, it is up to the caller.
 1472                          */
 1473                         pmap_clear_modify(m);
 1474                         m->valid = VM_PAGE_BITS_ALL;
 1475                         vm_page_undirty(m);
 1476 
 1477                         /*
 1478                          * We have to wake specifically requested pages
 1479                          * up too because we cleared PG_SWAPINPROG and
 1480                          * could be waiting for it in getpages.  However,
 1481                          * be sure to not unbusy getpages specifically
 1482                          * requested page - getpages expects it to be 
 1483                          * left busy.
 1484                          */
 1485                         if (i != bp->b_pager.pg_reqpage) {
 1486                                 vm_page_deactivate(m);
 1487                                 vm_page_wakeup(m);
 1488                         } else {
 1489                                 vm_page_flash(m);
 1490                         }
 1491                 } else {
 1492                         /*
 1493                          * For write success, clear the modify and dirty 
 1494                          * status, then finish the I/O ( which decrements the 
 1495                          * busy count and possibly wakes waiter's up ).
 1496                          */
 1497                         pmap_clear_modify(m);
 1498                         vm_page_undirty(m);
 1499                         vm_page_io_finish(m);
 1500                         if (vm_page_count_severe())
 1501                                 vm_page_try_to_cache(m);
 1502                 }
 1503         }
 1504         vm_page_unlock_queues();
 1505 
 1506         /*
 1507          * adjust pip.  NOTE: the original parent may still have its own
 1508          * pip refs on the object.
 1509          */
 1510         if (object != NULL) {
 1511                 vm_object_pip_wakeupn(object, bp->b_npages);
 1512                 VM_OBJECT_UNLOCK(object);
 1513         }
 1514 
 1515         /*
 1516          * release the physical I/O buffer
 1517          */
 1518         relpbuf(
 1519             bp, 
 1520             ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : 
 1521                 ((bp->b_flags & B_ASYNC) ? 
 1522                     &nsw_wcount_async : 
 1523                     &nsw_wcount_sync
 1524                 )
 1525             )
 1526         );
 1527 }
 1528 
 1529 /*
 1530  *      swap_pager_isswapped:
 1531  *
 1532  *      Return 1 if at least one page in the given object is paged
 1533  *      out to the given swap device.
 1534  *
 1535  *      This routine may not block.
 1536  */
 1537 int
 1538 swap_pager_isswapped(vm_object_t object, struct swdevt *sp)
 1539 {
 1540         daddr_t index = 0;
 1541         int bcount;
 1542         int i;
 1543 
 1544         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1545         if (object->type != OBJT_SWAP)
 1546                 return (0);
 1547 
 1548         mtx_lock(&swhash_mtx);
 1549         for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) {
 1550                 struct swblock *swap;
 1551 
 1552                 if ((swap = *swp_pager_hash(object, index)) != NULL) {
 1553                         for (i = 0; i < SWAP_META_PAGES; ++i) {
 1554                                 if (swp_pager_isondev(swap->swb_pages[i], sp)) {
 1555                                         mtx_unlock(&swhash_mtx);
 1556                                         return (1);
 1557                                 }
 1558                         }
 1559                 }
 1560                 index += SWAP_META_PAGES;
 1561                 if (index > 0x20000000)
 1562                         panic("swap_pager_isswapped: failed to locate all swap meta blocks");
 1563         }
 1564         mtx_unlock(&swhash_mtx);
 1565         return (0);
 1566 }
 1567 
 1568 /*
 1569  * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
 1570  *
 1571  *      This routine dissociates the page at the given index within a
 1572  *      swap block from its backing store, paging it in if necessary.
 1573  *      If the page is paged in, it is placed in the inactive queue,
 1574  *      since it had its backing store ripped out from under it.
 1575  *      We also attempt to swap in all other pages in the swap block,
 1576  *      we only guarantee that the one at the specified index is
 1577  *      paged in.
 1578  *
 1579  *      XXX - The code to page the whole block in doesn't work, so we
 1580  *            revert to the one-by-one behavior for now.  Sigh.
 1581  */
 1582 static __inline void
 1583 swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex)
 1584 {
 1585         vm_page_t m;
 1586 
 1587         vm_object_pip_add(object, 1);
 1588         m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
 1589         if (m->valid == VM_PAGE_BITS_ALL) {
 1590                 vm_object_pip_subtract(object, 1);
 1591                 vm_page_lock_queues();
 1592                 vm_page_activate(m);
 1593                 vm_page_dirty(m);
 1594                 vm_page_wakeup(m);
 1595                 vm_page_unlock_queues();
 1596                 vm_pager_page_unswapped(m);
 1597                 return;
 1598         }
 1599 
 1600         if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK)
 1601                 panic("swap_pager_force_pagein: read from swap failed");/*XXX*/
 1602         vm_object_pip_subtract(object, 1);
 1603         vm_page_lock_queues();
 1604         vm_page_dirty(m);
 1605         vm_page_dontneed(m);
 1606         vm_page_wakeup(m);
 1607         vm_page_unlock_queues();
 1608         vm_pager_page_unswapped(m);
 1609 }
 1610 
 1611 /*
 1612  *      swap_pager_swapoff:
 1613  *
 1614  *      Page in all of the pages that have been paged out to the
 1615  *      given device.  The corresponding blocks in the bitmap must be
 1616  *      marked as allocated and the device must be flagged SW_CLOSING.
 1617  *      There may be no processes swapped out to the device.
 1618  *
 1619  *      This routine may block.
 1620  */
 1621 static void
 1622 swap_pager_swapoff(struct swdevt *sp)
 1623 {
 1624         struct swblock *swap;
 1625         int i, j, retries;
 1626 
 1627         GIANT_REQUIRED;
 1628 
 1629         retries = 0;
 1630 full_rescan:
 1631         mtx_lock(&swhash_mtx);
 1632         for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
 1633 restart:
 1634                 for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) {
 1635                         vm_object_t object = swap->swb_object;
 1636                         vm_pindex_t pindex = swap->swb_index;
 1637                         for (j = 0; j < SWAP_META_PAGES; ++j) {
 1638                                 if (swp_pager_isondev(swap->swb_pages[j], sp)) {
 1639                                         /* avoid deadlock */
 1640                                         if (!VM_OBJECT_TRYLOCK(object)) {
 1641                                                 break;
 1642                                         } else {
 1643                                                 mtx_unlock(&swhash_mtx);
 1644                                                 swp_pager_force_pagein(object,
 1645                                                     pindex + j);
 1646                                                 VM_OBJECT_UNLOCK(object);
 1647                                                 mtx_lock(&swhash_mtx);
 1648                                                 goto restart;
 1649                                         }
 1650                                 }
 1651                         }
 1652                 }
 1653         }
 1654         mtx_unlock(&swhash_mtx);
 1655         if (sp->sw_used) {
 1656                 int dummy;
 1657                 /*
 1658                  * Objects may be locked or paging to the device being
 1659                  * removed, so we will miss their pages and need to
 1660                  * make another pass.  We have marked this device as
 1661                  * SW_CLOSING, so the activity should finish soon.
 1662                  */
 1663                 retries++;
 1664                 if (retries > 100) {
 1665                         panic("swapoff: failed to locate %d swap blocks",
 1666                             sp->sw_used);
 1667                 }
 1668                 tsleep(&dummy, PVM, "swpoff", hz / 20);
 1669                 goto full_rescan;
 1670         }
 1671 }
 1672 
 1673 /************************************************************************
 1674  *                              SWAP META DATA                          *
 1675  ************************************************************************
 1676  *
 1677  *      These routines manipulate the swap metadata stored in the 
 1678  *      OBJT_SWAP object.  All swp_*() routines must be called at
 1679  *      splvm() because swap can be freed up by the low level vm_page
 1680  *      code which might be called from interrupts beyond what splbio() covers.
 1681  *
 1682  *      Swap metadata is implemented with a global hash and not directly
 1683  *      linked into the object.  Instead the object simply contains
 1684  *      appropriate tracking counters.
 1685  */
 1686 
 1687 /*
 1688  * SWP_PAGER_META_BUILD() -     add swap block to swap meta data for object
 1689  *
 1690  *      We first convert the object to a swap object if it is a default
 1691  *      object.
 1692  *
 1693  *      The specified swapblk is added to the object's swap metadata.  If
 1694  *      the swapblk is not valid, it is freed instead.  Any previously
 1695  *      assigned swapblk is freed.
 1696  *
 1697  *      This routine must be called at splvm(), except when used to convert
 1698  *      an OBJT_DEFAULT object into an OBJT_SWAP object.
 1699  */
 1700 static void
 1701 swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
 1702 {
 1703         struct swblock *swap;
 1704         struct swblock **pswap;
 1705         int idx;
 1706 
 1707         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1708         /*
 1709          * Convert default object to swap object if necessary
 1710          */
 1711         if (object->type != OBJT_SWAP) {
 1712                 object->type = OBJT_SWAP;
 1713                 object->un_pager.swp.swp_bcount = 0;
 1714 
 1715                 if (object->handle != NULL) {
 1716                         mtx_lock(&sw_alloc_mtx);
 1717                         TAILQ_INSERT_TAIL(
 1718                             NOBJLIST(object->handle),
 1719                             object, 
 1720                             pager_object_list
 1721                         );
 1722                         mtx_unlock(&sw_alloc_mtx);
 1723                 }
 1724         }
 1725         
 1726         /*
 1727          * Locate hash entry.  If not found create, but if we aren't adding
 1728          * anything just return.  If we run out of space in the map we wait
 1729          * and, since the hash table may have changed, retry.
 1730          */
 1731 retry:
 1732         mtx_lock(&swhash_mtx);
 1733         pswap = swp_pager_hash(object, pindex);
 1734 
 1735         if ((swap = *pswap) == NULL) {
 1736                 int i;
 1737 
 1738                 if (swapblk == SWAPBLK_NONE)
 1739                         goto done;
 1740 
 1741                 swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT);
 1742                 if (swap == NULL) {
 1743                         mtx_unlock(&swhash_mtx);
 1744                         VM_OBJECT_UNLOCK(object);
 1745                         VM_WAIT;
 1746                         VM_OBJECT_LOCK(object);
 1747                         goto retry;
 1748                 }
 1749 
 1750                 swap->swb_hnext = NULL;
 1751                 swap->swb_object = object;
 1752                 swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK;
 1753                 swap->swb_count = 0;
 1754 
 1755                 ++object->un_pager.swp.swp_bcount;
 1756 
 1757                 for (i = 0; i < SWAP_META_PAGES; ++i)
 1758                         swap->swb_pages[i] = SWAPBLK_NONE;
 1759         }
 1760 
 1761         /*
 1762          * Delete prior contents of metadata
 1763          */
 1764         idx = pindex & SWAP_META_MASK;
 1765 
 1766         if (swap->swb_pages[idx] != SWAPBLK_NONE) {
 1767                 swp_pager_freeswapspace(swap->swb_pages[idx], 1);
 1768                 --swap->swb_count;
 1769         }
 1770 
 1771         /*
 1772          * Enter block into metadata
 1773          */
 1774         swap->swb_pages[idx] = swapblk;
 1775         if (swapblk != SWAPBLK_NONE)
 1776                 ++swap->swb_count;
 1777 done:
 1778         mtx_unlock(&swhash_mtx);
 1779 }
 1780 
 1781 /*
 1782  * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
 1783  *
 1784  *      The requested range of blocks is freed, with any associated swap 
 1785  *      returned to the swap bitmap.
 1786  *
 1787  *      This routine will free swap metadata structures as they are cleaned 
 1788  *      out.  This routine does *NOT* operate on swap metadata associated
 1789  *      with resident pages.
 1790  *
 1791  *      This routine must be called at splvm()
 1792  */
 1793 static void
 1794 swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
 1795 {
 1796 
 1797         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1798         if (object->type != OBJT_SWAP)
 1799                 return;
 1800 
 1801         while (count > 0) {
 1802                 struct swblock **pswap;
 1803                 struct swblock *swap;
 1804 
 1805                 mtx_lock(&swhash_mtx);
 1806                 pswap = swp_pager_hash(object, index);
 1807 
 1808                 if ((swap = *pswap) != NULL) {
 1809                         daddr_t v = swap->swb_pages[index & SWAP_META_MASK];
 1810 
 1811                         if (v != SWAPBLK_NONE) {
 1812                                 swp_pager_freeswapspace(v, 1);
 1813                                 swap->swb_pages[index & SWAP_META_MASK] =
 1814                                         SWAPBLK_NONE;
 1815                                 if (--swap->swb_count == 0) {
 1816                                         *pswap = swap->swb_hnext;
 1817                                         uma_zfree(swap_zone, swap);
 1818                                         --object->un_pager.swp.swp_bcount;
 1819                                 }
 1820                         }
 1821                         --count;
 1822                         ++index;
 1823                 } else {
 1824                         int n = SWAP_META_PAGES - (index & SWAP_META_MASK);
 1825                         count -= n;
 1826                         index += n;
 1827                 }
 1828                 mtx_unlock(&swhash_mtx);
 1829         }
 1830 }
 1831 
 1832 /*
 1833  * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
 1834  *
 1835  *      This routine locates and destroys all swap metadata associated with
 1836  *      an object.
 1837  *
 1838  *      This routine must be called at splvm()
 1839  */
 1840 static void
 1841 swp_pager_meta_free_all(vm_object_t object)
 1842 {
 1843         daddr_t index = 0;
 1844 
 1845         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1846         if (object->type != OBJT_SWAP)
 1847                 return;
 1848 
 1849         while (object->un_pager.swp.swp_bcount) {
 1850                 struct swblock **pswap;
 1851                 struct swblock *swap;
 1852 
 1853                 mtx_lock(&swhash_mtx);
 1854                 pswap = swp_pager_hash(object, index);
 1855                 if ((swap = *pswap) != NULL) {
 1856                         int i;
 1857 
 1858                         for (i = 0; i < SWAP_META_PAGES; ++i) {
 1859                                 daddr_t v = swap->swb_pages[i];
 1860                                 if (v != SWAPBLK_NONE) {
 1861                                         --swap->swb_count;
 1862                                         swp_pager_freeswapspace(v, 1);
 1863                                 }
 1864                         }
 1865                         if (swap->swb_count != 0)
 1866                                 panic("swap_pager_meta_free_all: swb_count != 0");
 1867                         *pswap = swap->swb_hnext;
 1868                         uma_zfree(swap_zone, swap);
 1869                         --object->un_pager.swp.swp_bcount;
 1870                 }
 1871                 mtx_unlock(&swhash_mtx);
 1872                 index += SWAP_META_PAGES;
 1873                 if (index > 0x20000000)
 1874                         panic("swp_pager_meta_free_all: failed to locate all swap meta blocks");
 1875         }
 1876 }
 1877 
 1878 /*
 1879  * SWP_PAGER_METACTL() -  misc control of swap and vm_page_t meta data.
 1880  *
 1881  *      This routine is capable of looking up, popping, or freeing
 1882  *      swapblk assignments in the swap meta data or in the vm_page_t.
 1883  *      The routine typically returns the swapblk being looked-up, or popped,
 1884  *      or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
 1885  *      was invalid.  This routine will automatically free any invalid 
 1886  *      meta-data swapblks.
 1887  *
 1888  *      It is not possible to store invalid swapblks in the swap meta data
 1889  *      (other then a literal 'SWAPBLK_NONE'), so we don't bother checking.
 1890  *
 1891  *      When acting on a busy resident page and paging is in progress, we 
 1892  *      have to wait until paging is complete but otherwise can act on the 
 1893  *      busy page.
 1894  *
 1895  *      This routine must be called at splvm().
 1896  *
 1897  *      SWM_FREE        remove and free swap block from metadata
 1898  *      SWM_POP         remove from meta data but do not free.. pop it out
 1899  */
 1900 static daddr_t
 1901 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
 1902 {
 1903         struct swblock **pswap;
 1904         struct swblock *swap;
 1905         daddr_t r1;
 1906         int idx;
 1907 
 1908         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1909         /*
 1910          * The meta data only exists of the object is OBJT_SWAP 
 1911          * and even then might not be allocated yet.
 1912          */
 1913         if (object->type != OBJT_SWAP)
 1914                 return (SWAPBLK_NONE);
 1915 
 1916         r1 = SWAPBLK_NONE;
 1917         mtx_lock(&swhash_mtx);
 1918         pswap = swp_pager_hash(object, pindex);
 1919 
 1920         if ((swap = *pswap) != NULL) {
 1921                 idx = pindex & SWAP_META_MASK;
 1922                 r1 = swap->swb_pages[idx];
 1923 
 1924                 if (r1 != SWAPBLK_NONE) {
 1925                         if (flags & SWM_FREE) {
 1926                                 swp_pager_freeswapspace(r1, 1);
 1927                                 r1 = SWAPBLK_NONE;
 1928                         }
 1929                         if (flags & (SWM_FREE|SWM_POP)) {
 1930                                 swap->swb_pages[idx] = SWAPBLK_NONE;
 1931                                 if (--swap->swb_count == 0) {
 1932                                         *pswap = swap->swb_hnext;
 1933                                         uma_zfree(swap_zone, swap);
 1934                                         --object->un_pager.swp.swp_bcount;
 1935                                 }
 1936                         } 
 1937                 }
 1938         }
 1939         mtx_unlock(&swhash_mtx);
 1940         return (r1);
 1941 }
 1942 
 1943 /*
 1944  * System call swapon(name) enables swapping on device name,
 1945  * which must be in the swdevsw.  Return EBUSY
 1946  * if already swapping on this device.
 1947  */
 1948 #ifndef _SYS_SYSPROTO_H_
 1949 struct swapon_args {
 1950         char *name;
 1951 };
 1952 #endif
 1953 
 1954 /* 
 1955  * MPSAFE
 1956  */
 1957 /* ARGSUSED */
 1958 int
 1959 swapon(struct thread *td, struct swapon_args *uap)
 1960 {
 1961         struct vattr attr;
 1962         struct vnode *vp;
 1963         struct nameidata nd;
 1964         int error;
 1965 
 1966         mtx_lock(&Giant);
 1967         error = suser(td);
 1968         if (error)
 1969                 goto done2;
 1970 
 1971         while (swdev_syscall_active)
 1972             tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
 1973         swdev_syscall_active = 1;
 1974 
 1975         /*
 1976          * Swap metadata may not fit in the KVM if we have physical
 1977          * memory of >1GB.
 1978          */
 1979         if (swap_zone == NULL) {
 1980                 error = ENOMEM;
 1981                 goto done;
 1982         }
 1983 
 1984         NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW, UIO_USERSPACE, uap->name, td);
 1985         error = namei(&nd);
 1986         if (error)
 1987                 goto done;
 1988 
 1989         NDFREE(&nd, NDF_ONLY_PNBUF);
 1990         vp = nd.ni_vp;
 1991 
 1992         if (vn_isdisk(vp, &error)) {
 1993                 error = swapongeom(td, vp);
 1994         } else if (vp->v_type == VREG &&
 1995             (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 1996             (error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) == 0) {
 1997                 /*
 1998                  * Allow direct swapping to NFS regular files in the same
 1999                  * way that nfs_mountroot() sets up diskless swapping.
 2000                  */
 2001                 error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
 2002         }
 2003 
 2004         if (error)
 2005                 vrele(vp);
 2006 done:
 2007         swdev_syscall_active = 0;
 2008         wakeup_one(&swdev_syscall_active);
 2009 done2:
 2010         mtx_unlock(&Giant);
 2011         return (error);
 2012 }
 2013 
 2014 static void
 2015 swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev)
 2016 {
 2017         struct swdevt *sp, *tsp;
 2018         swblk_t dvbase;
 2019         u_long mblocks;
 2020 
 2021         /*
 2022          * If we go beyond this, we get overflows in the radix
 2023          * tree bitmap code.
 2024          */
 2025         mblocks = 0x40000000 / BLIST_META_RADIX;
 2026         if (nblks > mblocks) {
 2027                 printf("WARNING: reducing size to maximum of %lu blocks per swap unit\n",
 2028                         mblocks);
 2029                 nblks = mblocks;
 2030         }
 2031         /*
 2032          * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
 2033          * First chop nblks off to page-align it, then convert.
 2034          * 
 2035          * sw->sw_nblks is in page-sized chunks now too.
 2036          */
 2037         nblks &= ~(ctodb(1) - 1);
 2038         nblks = dbtoc(nblks);
 2039 
 2040         sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO);
 2041         sp->sw_vp = vp;
 2042         sp->sw_id = id;
 2043         sp->sw_dev = dev;
 2044         sp->sw_flags = 0;
 2045         sp->sw_nblks = nblks;
 2046         sp->sw_used = 0;
 2047         sp->sw_strategy = strategy;
 2048         sp->sw_close = close;
 2049 
 2050         sp->sw_blist = blist_create(nblks);
 2051         /*
 2052          * Do not free the first two block in order to avoid overwriting
 2053          * any bsd label at the front of the partition
 2054          */
 2055         blist_free(sp->sw_blist, 2, nblks - 2);
 2056 
 2057         dvbase = 0;
 2058         mtx_lock(&sw_dev_mtx);
 2059         TAILQ_FOREACH(tsp, &swtailq, sw_list) {
 2060                 if (tsp->sw_end >= dvbase) {
 2061                         /*
 2062                          * We put one uncovered page between the devices
 2063                          * in order to definitively prevent any cross-device
 2064                          * I/O requests
 2065                          */
 2066                         dvbase = tsp->sw_end + 1;
 2067                 }
 2068         }
 2069         sp->sw_first = dvbase;
 2070         sp->sw_end = dvbase + nblks;
 2071         TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
 2072         nswapdev++;
 2073         swap_pager_avail += nblks;
 2074         swp_sizecheck();
 2075         mtx_unlock(&sw_dev_mtx);
 2076 }
 2077 
 2078 /*
 2079  * SYSCALL: swapoff(devname)
 2080  *
 2081  * Disable swapping on the given device.
 2082  *
 2083  * XXX: Badly designed system call: it should use a device index
 2084  * rather than filename as specification.  We keep sw_vp around
 2085  * only to make this work.
 2086  */
 2087 #ifndef _SYS_SYSPROTO_H_
 2088 struct swapoff_args {
 2089         char *name;
 2090 };
 2091 #endif
 2092 
 2093 /*
 2094  * MPSAFE
 2095  */
 2096 /* ARGSUSED */
 2097 int
 2098 swapoff(struct thread *td, struct swapoff_args *uap)
 2099 {
 2100         struct vnode *vp;
 2101         struct nameidata nd;
 2102         struct swdevt *sp;
 2103         u_long nblks, dvbase;
 2104         int error;
 2105 
 2106         mtx_lock(&Giant);
 2107 
 2108         error = suser(td);
 2109         if (error)
 2110                 goto done2;
 2111 
 2112         while (swdev_syscall_active)
 2113             tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
 2114         swdev_syscall_active = 1;
 2115 
 2116         NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
 2117         error = namei(&nd);
 2118         if (error)
 2119                 goto done;
 2120         NDFREE(&nd, NDF_ONLY_PNBUF);
 2121         vp = nd.ni_vp;
 2122 
 2123         mtx_lock(&sw_dev_mtx);
 2124         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2125                 if (sp->sw_vp == vp)
 2126                         goto found;
 2127         }
 2128         mtx_unlock(&sw_dev_mtx);
 2129         error = EINVAL;
 2130         goto done;
 2131 found:
 2132         mtx_unlock(&sw_dev_mtx);
 2133 #ifdef MAC
 2134         (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 2135         error = mac_check_system_swapoff(td->td_ucred, vp);
 2136         (void) VOP_UNLOCK(vp, 0, td);
 2137         if (error != 0)
 2138                 goto done;
 2139 #endif
 2140         
 2141         nblks = sp->sw_nblks;
 2142 
 2143         /*
 2144          * We can turn off this swap device safely only if the
 2145          * available virtual memory in the system will fit the amount
 2146          * of data we will have to page back in, plus an epsilon so
 2147          * the system doesn't become critically low on swap space.
 2148          */
 2149         if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail <
 2150             nblks + nswap_lowat) {
 2151                 error = ENOMEM;
 2152                 goto done;
 2153         }
 2154 
 2155         /*
 2156          * Prevent further allocations on this device.
 2157          */
 2158         mtx_lock(&sw_dev_mtx);
 2159         sp->sw_flags |= SW_CLOSING;
 2160         for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) {
 2161                 swap_pager_avail -= blist_fill(sp->sw_blist,
 2162                      dvbase, dmmax);
 2163         }
 2164         mtx_unlock(&sw_dev_mtx);
 2165 
 2166         /*
 2167          * Page in the contents of the device and close it.
 2168          */
 2169         swap_pager_swapoff(sp);
 2170 
 2171         sp->sw_close(td, sp);
 2172         sp->sw_id = NULL;
 2173         mtx_lock(&sw_dev_mtx);
 2174         TAILQ_REMOVE(&swtailq, sp, sw_list);
 2175         nswapdev--;
 2176         if (nswapdev == 0) {
 2177                 swap_pager_full = 2;
 2178                 swap_pager_almost_full = 1;
 2179         }
 2180         if (swdevhd == sp)
 2181                 swdevhd = NULL;
 2182         mtx_unlock(&sw_dev_mtx);
 2183         blist_destroy(sp->sw_blist);
 2184         free(sp, M_VMPGDATA);
 2185 
 2186 done:
 2187         swdev_syscall_active = 0;
 2188         wakeup_one(&swdev_syscall_active);
 2189 done2:
 2190         mtx_unlock(&Giant);
 2191         return (error);
 2192 }
 2193 
 2194 void
 2195 swap_pager_status(int *total, int *used)
 2196 {
 2197         struct swdevt *sp;
 2198 
 2199         *total = 0;
 2200         *used = 0;
 2201         mtx_lock(&sw_dev_mtx);
 2202         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2203                 *total += sp->sw_nblks;
 2204                 *used += sp->sw_used;
 2205         }
 2206         mtx_unlock(&sw_dev_mtx);
 2207 }
 2208 
 2209 static int
 2210 sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
 2211 {
 2212         int     *name = (int *)arg1;
 2213         int     error, n;
 2214         struct xswdev xs;
 2215         struct swdevt *sp;
 2216 
 2217         if (arg2 != 1) /* name length */
 2218                 return (EINVAL);
 2219 
 2220         n = 0;
 2221         mtx_lock(&sw_dev_mtx);
 2222         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2223                 if (n == *name) {
 2224                         mtx_unlock(&sw_dev_mtx);
 2225                         xs.xsw_version = XSWDEV_VERSION;
 2226                         xs.xsw_dev = sp->sw_dev;
 2227                         xs.xsw_flags = sp->sw_flags;
 2228                         xs.xsw_nblks = sp->sw_nblks;
 2229                         xs.xsw_used = sp->sw_used;
 2230 
 2231                         error = SYSCTL_OUT(req, &xs, sizeof(xs));
 2232                         return (error);
 2233                 }
 2234                 n++;
 2235         }
 2236         mtx_unlock(&sw_dev_mtx);
 2237         return (ENOENT);
 2238 }
 2239 
 2240 SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
 2241     "Number of swap devices");
 2242 SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info,
 2243     "Swap statistics by device");
 2244 
 2245 /*
 2246  * vmspace_swap_count() - count the approximate swap useage in pages for a
 2247  *                        vmspace.
 2248  *
 2249  *      The map must be locked.
 2250  *
 2251  *      Swap useage is determined by taking the proportional swap used by
 2252  *      VM objects backing the VM map.  To make up for fractional losses,
 2253  *      if the VM object has any swap use at all the associated map entries
 2254  *      count for at least 1 swap page.
 2255  */
 2256 int
 2257 vmspace_swap_count(struct vmspace *vmspace)
 2258 {
 2259         vm_map_t map = &vmspace->vm_map;
 2260         vm_map_entry_t cur;
 2261         int count = 0;
 2262 
 2263         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 2264                 vm_object_t object;
 2265 
 2266                 if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
 2267                     (object = cur->object.vm_object) != NULL) {
 2268                         VM_OBJECT_LOCK(object);
 2269                         if (object->type == OBJT_SWAP &&
 2270                             object->un_pager.swp.swp_bcount != 0) {
 2271                                 int n = (cur->end - cur->start) / PAGE_SIZE;
 2272 
 2273                                 count += object->un_pager.swp.swp_bcount *
 2274                                     SWAP_META_PAGES * n / object->size + 1;
 2275                         }
 2276                         VM_OBJECT_UNLOCK(object);
 2277                 }
 2278         }
 2279         return (count);
 2280 }
 2281 
 2282 /*
 2283  * GEOM backend
 2284  *
 2285  * Swapping onto disk devices.
 2286  *
 2287  */
 2288 
/*
 * Forward declaration: the orphan handler is defined further down but
 * is referenced by the class initializer below.
 */
static g_orphan_t swapgeom_orphan;

/*
 * GEOM class descriptor for swap consumers.  Only the name, the GEOM
 * version and the orphan callback are set; every other member is left
 * at its default (zero) value.
 */
static struct g_class g_swap_class = {
	.name = "SWAP",
	.version = G_VERSION,
	.orphan = swapgeom_orphan,
};

/* Declare the class to GEOM under the module name g_class. */
DECLARE_GEOM_CLASS(g_swap_class, g_class);
 2298 
 2299 
 2300 static void
 2301 swapgeom_done(struct bio *bp2)
 2302 {
 2303         struct buf *bp;
 2304 
 2305         bp = bp2->bio_caller2;
 2306         bp->b_ioflags = bp2->bio_flags;
 2307         if (bp2->bio_error)
 2308                 bp->b_ioflags |= BIO_ERROR;
 2309         bp->b_resid = bp->b_bcount - bp2->bio_completed;
 2310         bp->b_error = bp2->bio_error;
 2311         bufdone(bp);
 2312         g_destroy_bio(bp2);
 2313 }
 2314 
 2315 static void
 2316 swapgeom_strategy(struct buf *bp, struct swdevt *sp)
 2317 {
 2318         struct bio *bio;
 2319         struct g_consumer *cp;
 2320 
 2321         cp = sp->sw_id;
 2322         if (cp == NULL) {
 2323                 bp->b_error = ENXIO;
 2324                 bp->b_ioflags |= BIO_ERROR;
 2325                 bufdone(bp);
 2326                 return;
 2327         }
 2328         bio = g_alloc_bio();
 2329 #if 0
 2330         /*
 2331          * XXX: We shouldn't really sleep here when we run out of buffers
 2332          * XXX: but the alternative is worse right now.
 2333          */
 2334         if (bio == NULL) {
 2335                 bp->b_error = ENOMEM;
 2336                 bp->b_ioflags |= BIO_ERROR;
 2337                 bufdone(bp);
 2338                 return;
 2339         }
 2340 #endif
 2341         bio->bio_caller2 = bp;
 2342         bio->bio_cmd = bp->b_iocmd;
 2343         bio->bio_data = bp->b_data;
 2344         bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
 2345         bio->bio_length = bp->b_bcount;
 2346         bio->bio_done = swapgeom_done;
 2347         g_io_request(bio, cp);
 2348         return;
 2349 }
 2350 
 2351 static void
 2352 swapgeom_orphan(struct g_consumer *cp)
 2353 {
 2354         struct swdevt *sp;
 2355 
 2356         mtx_lock(&sw_dev_mtx);
 2357         TAILQ_FOREACH(sp, &swtailq, sw_list)
 2358                 if (sp->sw_id == cp)
 2359                         sp->sw_id = NULL;
 2360         mtx_unlock(&sw_dev_mtx);
 2361 }
 2362 
 2363 static void
 2364 swapgeom_close_ev(void *arg, int flags)
 2365 {
 2366         struct g_consumer *cp;
 2367 
 2368         cp = arg;
 2369         g_access(cp, -1, -1, 0);
 2370         g_detach(cp);
 2371         g_destroy_consumer(cp);
 2372 }
 2373 
 2374 static void
 2375 swapgeom_close(struct thread *td, struct swdevt *sw)
 2376 {
 2377 
 2378         /* XXX: direct call when Giant untangled */
 2379         g_waitfor_event(swapgeom_close_ev, sw->sw_id, M_WAITOK, NULL);
 2380 }
 2381 
 2382 
 2383 struct swh0h0 {
 2384         struct cdev *dev;
 2385         struct vnode *vp;
 2386         int     error;
 2387 };
 2388 
 2389 static void
 2390 swapongeom_ev(void *arg, int flags)
 2391 {
 2392         struct swh0h0 *swh;
 2393         struct g_provider *pp;
 2394         struct g_consumer *cp;
 2395         static struct g_geom *gp;
 2396         struct swdevt *sp;
 2397         u_long nblks;
 2398         int error;
 2399 
 2400         swh = arg;
 2401         swh->error = 0;
 2402         pp = g_dev_getprovider(swh->dev);
 2403         if (pp == NULL) {
 2404                 swh->error = ENODEV;
 2405                 return;
 2406         }
 2407         mtx_lock(&sw_dev_mtx);
 2408         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2409                 cp = sp->sw_id;
 2410                 if (cp != NULL && cp->provider == pp) {
 2411                         mtx_unlock(&sw_dev_mtx);
 2412                         swh->error = EBUSY;
 2413                         return;
 2414                 }
 2415         }
 2416         mtx_unlock(&sw_dev_mtx);
 2417         if (gp == NULL)
 2418                 gp = g_new_geomf(&g_swap_class, "swap", NULL);
 2419         cp = g_new_consumer(gp);
 2420         g_attach(cp, pp);
 2421         /*
 2422          * XXX: Everytime you think you can improve the margin for
 2423          * footshooting, somebody depends on the ability to do so:
 2424          * savecore(8) wants to write to our swapdev so we cannot
 2425          * set an exclusive count :-(
 2426          */
 2427         error = g_access(cp, 1, 1, 0);
 2428         if (error) {
 2429                 g_detach(cp);
 2430                 g_destroy_consumer(cp);
 2431                 swh->error = error;
 2432                 return;
 2433         }
 2434         nblks = pp->mediasize / DEV_BSIZE;
 2435         swaponsomething(swh->vp, cp, nblks, swapgeom_strategy,
 2436             swapgeom_close, dev2udev(swh->dev));
 2437         swh->error = 0;
 2438         return;
 2439 }
 2440 
 2441 static int
 2442 swapongeom(struct thread *td, struct vnode *vp)
 2443 {
 2444         int error;
 2445         struct swh0h0 swh;
 2446 
 2447         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 2448 
 2449         swh.dev = vp->v_rdev;
 2450         swh.vp = vp;
 2451         swh.error = 0;
 2452         /* XXX: direct call when Giant untangled */
 2453         error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL);
 2454         if (!error)
 2455                 error = swh.error;
 2456         VOP_UNLOCK(vp, 0, td);
 2457         return (error);
 2458 }
 2459 
 2460 /*
 2461  * VNODE backend
 2462  *
 2463  * This is used mainly for network filesystem (read: probably only tested
 2464  * with NFS) swapfiles.
 2465  *
 2466  */
 2467 
 2468 static void
 2469 swapdev_strategy(struct buf *bp, struct swdevt *sp)
 2470 {
 2471         struct vnode *vp2;
 2472 
 2473         bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first);
 2474 
 2475         vp2 = sp->sw_id;
 2476         vhold(vp2);
 2477         if (bp->b_iocmd == BIO_WRITE) {
 2478                 if (bp->b_bufobj) /* XXX: should always be true /phk */
 2479                         bufobj_wdrop(bp->b_bufobj);
 2480                 bufobj_wref(&vp2->v_bufobj);
 2481         }
 2482         bp->b_vp = vp2;
 2483         bp->b_iooffset = dbtob(bp->b_blkno);
 2484         bstrategy(bp);
 2485         return;
 2486 }
 2487 
 2488 static void
 2489 swapdev_close(struct thread *td, struct swdevt *sp)
 2490 {
 2491 
 2492         VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td);
 2493         vrele(sp->sw_vp);
 2494 }
 2495 
 2496 
 2497 static int
 2498 swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
 2499 {
 2500         struct swdevt *sp;
 2501         int error;
 2502 
 2503         if (nblks == 0)
 2504                 return (ENXIO);
 2505         mtx_lock(&sw_dev_mtx);
 2506         TAILQ_FOREACH(sp, &swtailq, sw_list) {
 2507                 if (sp->sw_id == vp) {
 2508                         mtx_unlock(&sw_dev_mtx);
 2509                         return (EBUSY);
 2510                 }
 2511         }
 2512         mtx_unlock(&sw_dev_mtx);
 2513     
 2514         (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 2515 #ifdef MAC
 2516         error = mac_check_system_swapon(td->td_ucred, vp);
 2517         if (error == 0)
 2518 #endif
 2519                 error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, -1);
 2520         (void) VOP_UNLOCK(vp, 0, td);
 2521         if (error)
 2522                 return (error);
 2523 
 2524         swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
 2525             NODEV);
 2526         return (0);
 2527 }

Cache object: 295c9c4dc0f8e9985500abf5e739d991


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.