The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_swap.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 1982, 1986, 1989, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  * 3. All advertising materials mentioning features or use of this software
   14  *    must display the following acknowledgement:
   15  *      This product includes software developed by the University of
   16  *      California, Berkeley and its contributors.
   17  * 4. Neither the name of the University nor the names of its contributors
   18  *    may be used to endorse or promote products derived from this software
   19  *    without specific prior written permission.
   20  *
   21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   31  * SUCH DAMAGE.
   32  *
   33  *      @(#)vm_swap.c   8.5 (Berkeley) 2/17/94
   34  * $FreeBSD: releng/5.1/sys/vm/vm_swap.c 111936 2003-03-05 23:50:15Z rwatson $
   35  */
   36 
   37 #include "opt_mac.h"
   38 #include "opt_swap.h"
   39 #include "opt_vm.h"
   40 
   41 #include <sys/param.h>
   42 #include <sys/systm.h>
   43 #include <sys/sysproto.h>
   44 #include <sys/bio.h>
   45 #include <sys/buf.h>
   46 #include <sys/proc.h>
   47 #include <sys/namei.h>
   48 #include <sys/vnode.h>
   49 #include <sys/fcntl.h>
   50 #include <sys/blist.h>
   51 #include <sys/kernel.h>
   52 #include <sys/lock.h>
   53 #include <sys/conf.h>
   54 #include <sys/disk.h>
   55 #include <sys/stat.h>
   56 #include <sys/sysctl.h>
   57 #include <sys/mac.h>
   58 #include <sys/mount.h>
   59 #include <vm/vm.h>
   60 #include <vm/vm_extern.h>
   61 #include <vm/vm_param.h>
   62 #include <vm/vm_pageout.h>
   63 #include <vm/swap_pager.h>
   64 #include <vm/uma.h>
   65 
   66 /*
   67  * Indirect driver for multi-controller paging.
   68  */
   69 
   70 typedef int32_t swblk_t;        /* swap offset */
   71 
   72 #ifndef NSWAPDEV
   73 #define NSWAPDEV        4
   74 #endif
   75 static struct swdevt should_be_malloced[NSWAPDEV];
   76 struct swdevt *swdevt = should_be_malloced;
   77 static int nswap;               /* first block after the interleaved devs */
   78 int nswdev = NSWAPDEV;
   79 int vm_swap_size;
   80 static int swdev_syscall_active = 0; /* serialize swap(on|off) */
   81 
   82 
   83 static int swapdev_strategy(struct vop_strategy_args *ap);
   84 struct vnode *swapdev_vp;
   85 
   86 /*
   87  *      swapdev_strategy:
   88  *
   89  *      VOP_STRATEGY() for swapdev_vp.
   90  *      Perform swap strategy interleave device selection.
   91  *
   92  *      The bp is expected to be locked and *not* B_DONE on call.
   93  */
   94 static int
   95 swapdev_strategy(ap)
   96         struct vop_strategy_args /* {
   97                 struct vnode *a_vp;
   98                 struct buf *a_bp;
   99         } */ *ap;
  100 {
  101         int s, sz, off, seg, index;
  102         struct swdevt *sp;
  103         struct vnode *vp;
  104         struct buf *bp;
  105 
  106         bp = ap->a_bp;
  107         sz = howmany(bp->b_bcount, PAGE_SIZE);
  108 
  109         /*
  110          * Convert interleaved swap into per-device swap.  Note that
  111          * the block size is left in PAGE_SIZE'd chunks (for the newswap)
  112          * here.
  113          */
  114         if (nswdev > 1) {
  115                 off = bp->b_blkno % dmmax;
  116                 if (off + sz > dmmax) {
  117                         bp->b_error = EINVAL;
  118                         bp->b_ioflags |= BIO_ERROR;
  119                         bufdone(bp);
  120                         return 0;
  121                 }
  122                 seg = bp->b_blkno / dmmax;
  123                 index = seg % nswdev;
  124                 seg /= nswdev;
  125                 bp->b_blkno = seg * dmmax + off;
  126         } else {
  127                 index = 0;
  128         }
  129         sp = &swdevt[index];
  130         if (bp->b_blkno + sz > sp->sw_nblks) {
  131                 bp->b_error = EINVAL;
  132                 bp->b_ioflags |= BIO_ERROR;
  133                 bufdone(bp);
  134                 return 0;
  135         }
  136         bp->b_dev = sp->sw_device;
  137         if (sp->sw_vp == NULL) {
  138                 bp->b_error = ENODEV;
  139                 bp->b_ioflags |= BIO_ERROR;
  140                 bufdone(bp);
  141                 return 0;
  142         }
  143 
  144         /*
  145          * Convert from PAGE_SIZE'd to DEV_BSIZE'd chunks for the actual I/O
  146          */
  147         bp->b_blkno = ctodb(bp->b_blkno);
  148 
  149         vhold(sp->sw_vp);
  150         s = splvm();
  151         if (bp->b_iocmd == BIO_WRITE) {
  152                 vp = bp->b_vp;
  153                 if (vp) {
  154                         VI_LOCK(vp);
  155                         vp->v_numoutput--;
  156                         if ((vp->v_iflag & VI_BWAIT) && vp->v_numoutput <= 0) {
  157                                 vp->v_iflag &= ~VI_BWAIT;
  158                                 wakeup(&vp->v_numoutput);
  159                         }
  160                         VI_UNLOCK(vp);
  161                 }
  162                 VI_LOCK(sp->sw_vp);
  163                 sp->sw_vp->v_numoutput++;
  164                 VI_UNLOCK(sp->sw_vp);
  165         }
  166         bp->b_vp = sp->sw_vp;
  167         splx(s);
  168         if (bp->b_vp->v_type == VCHR)
  169                 VOP_SPECSTRATEGY(bp->b_vp, bp);
  170         else
  171                 VOP_STRATEGY(bp->b_vp, bp);
  172         return 0;
  173 }
  174 
  175 /*
  176  * Create a special vnode op vector for swapdev_vp - we only use
  177  * VOP_STRATEGY() and reclaim; everything else returns an error.
  178  */
  179 vop_t **swapdev_vnodeop_p;
  180 static struct vnodeopv_entry_desc swapdev_vnodeop_entries[] = {  
  181         { &vop_default_desc,            (vop_t *) vop_defaultop },
  182         { &vop_reclaim_desc,            (vop_t *) vop_null },
  183         { &vop_strategy_desc,           (vop_t *) swapdev_strategy },
  184         { NULL, NULL }
  185 };
  186 static struct vnodeopv_desc swapdev_vnodeop_opv_desc =
  187         { &swapdev_vnodeop_p, swapdev_vnodeop_entries };
  188 
  189 VNODEOP_SET(swapdev_vnodeop_opv_desc);
  190 
  191 /*
  192  * System call swapon(name) enables swapping on device name,
  193  * which must be in the swdevsw.  Return EBUSY
  194  * if already swapping on this device.
  195  */
  196 #ifndef _SYS_SYSPROTO_H_
  197 struct swapon_args {
  198         char *name;
  199 };
  200 #endif
  201 
  202 /* 
  203  * MPSAFE
  204  */
  205 /* ARGSUSED */
  206 int
  207 swapon(td, uap)
  208         struct thread *td;
  209         struct swapon_args *uap;
  210 {
  211         struct vattr attr;
  212         struct vnode *vp;
  213         struct nameidata nd;
  214         int error;
  215 
  216         mtx_lock(&Giant);
  217         error = suser(td);
  218         if (error)
  219                 goto done2;
  220 
  221         while (swdev_syscall_active)
  222             tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
  223         swdev_syscall_active = 1;
  224 
  225         /*
  226          * Swap metadata may not fit in the KVM if we have physical
  227          * memory of >1GB.
  228          */
  229         if (swap_zone == NULL) {
  230                 error = ENOMEM;
  231                 goto done;
  232         }
  233 
  234         NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
  235         error = namei(&nd);
  236         if (error)
  237                 goto done;
  238 
  239         NDFREE(&nd, NDF_ONLY_PNBUF);
  240         vp = nd.ni_vp;
  241 
  242         if (vn_isdisk(vp, &error))
  243                 error = swaponvp(td, vp, vp->v_rdev, 0);
  244         else if (vp->v_type == VREG &&
  245             (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
  246             (error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) == 0) {
  247                 /*
  248                  * Allow direct swapping to NFS regular files in the same
  249                  * way that nfs_mountroot() sets up diskless swapping.
  250                  */
  251                 error = swaponvp(td, vp, NODEV, attr.va_size / DEV_BSIZE);
  252         }
  253 
  254         if (error)
  255                 vrele(vp);
  256 done:
  257         swdev_syscall_active = 0;
  258         wakeup_one(&swdev_syscall_active);
  259 done2:
  260         mtx_unlock(&Giant);
  261         return (error);
  262 }
  263 
  264 /*
  265  * Swfree(index) frees the index'th portion of the swap map.
  266  * Each of the nswdev devices provides 1/nswdev'th of the swap
  267  * space, which is laid out with blocks of dmmax pages circularly
  268  * among the devices.
  269  *
  270  * The new swap code uses page-sized blocks.  The old swap code used
  271  * DEV_BSIZE'd chunks.
  272  */
  273 int
  274 swaponvp(td, vp, dev, nblks)
  275         struct thread *td;
  276         struct vnode *vp;
  277         dev_t dev;
  278         u_long nblks;
  279 {
  280         int index;
  281         struct swdevt *sp;
  282         swblk_t vsbase;
  283         long blk;
  284         swblk_t dvbase;
  285         int error;
  286         u_long aligned_nblks;
  287         off_t mediasize;
  288 
  289         if (!swapdev_vp) {
  290                 error = getnewvnode("none", NULL, swapdev_vnodeop_p,
  291                     &swapdev_vp);
  292                 if (error)
  293                         panic("Cannot get vnode for swapdev");
  294                 swapdev_vp->v_type = VNON;      /* Untyped */
  295         }
  296 
  297         ASSERT_VOP_UNLOCKED(vp, "swaponvp");
  298         for (sp = swdevt, index = 0 ; index < nswdev; index++, sp++) {
  299                 if (sp->sw_vp == vp)
  300                         return EBUSY;
  301                 if (!sp->sw_vp)
  302                         goto found;
  303 
  304         }
  305         return EINVAL;
  306     found:
  307         (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  308 #ifdef MAC
  309         error = mac_check_system_swapon(td->td_ucred, vp);
  310         if (error == 0)
  311 #endif
  312                 error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td);
  313         (void) VOP_UNLOCK(vp, 0, td);
  314         if (error)
  315                 return (error);
  316 
  317         error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
  318             FREAD, td->td_ucred, td);
  319         if (error == 0)
  320             nblks = mediasize / DEV_BSIZE;
  321         else
  322             nblks = 0;
  323         /*
  324          * XXX: We should also check that the sectorsize makes sense
  325          * XXX: it should be a power of two, no larger than the page size.
  326          */
  327         if (nblks == 0) {
  328                 (void) VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
  329                 return (ENXIO);
  330         }
  331 
  332         /*
  333          * If we go beyond this, we get overflows in the radix
  334          * tree bitmap code.
  335          */
  336         if (nblks > 0x40000000 / BLIST_META_RADIX / nswdev) {
  337                 printf("exceeded maximum of %d blocks per swap unit\n",
  338                         0x40000000 / BLIST_META_RADIX / nswdev);
  339                 (void) VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
  340                 return (ENXIO);
  341         }
  342         /*
  343          * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
  344          * First chop nblks off to page-align it, then convert.
  345          * 
  346          * sw->sw_nblks is in page-sized chunks now too.
  347          */
  348         nblks &= ~(ctodb(1) - 1);
  349         nblks = dbtoc(nblks);
  350 
  351         sp->sw_vp = vp;
  352         sp->sw_dev = dev2udev(dev);
  353         sp->sw_device = dev;
  354         sp->sw_flags = SW_FREED;
  355         sp->sw_nblks = nblks;
  356         sp->sw_used = 0;
  357 
  358         /*
  359          * nblks, nswap, and dmmax are PAGE_SIZE'd parameters now, not
  360          * DEV_BSIZE'd.   aligned_nblks is used to calculate the
  361          * size of the swap bitmap, taking into account the stripe size.
  362          */
  363         aligned_nblks = (nblks + (dmmax -1)) & ~(u_long)(dmmax -1);
  364 
  365         if (aligned_nblks * nswdev > nswap)
  366                 nswap = aligned_nblks * nswdev;
  367 
  368         if (swapblist == NULL)
  369                 swapblist = blist_create(nswap);
  370         else
  371                 blist_resize(&swapblist, nswap, 0);
  372 
  373         for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) {
  374                 blk = min(nblks - dvbase, dmmax);
  375                 vsbase = index * dmmax + dvbase * nswdev;
  376                 blist_free(swapblist, vsbase, blk);
  377                 vm_swap_size += blk;
  378         }
  379 
  380         swap_pager_full = 0;
  381 
  382         return (0);
  383 }
  384 
  385 /*
  386  * SYSCALL: swapoff(devname)
  387  *
  388  * Disable swapping on the given device.
  389  */
  390 #ifndef _SYS_SYSPROTO_H_
  391 struct swapoff_args {
  392         char *name;
  393 };
  394 #endif
  395 
  396 /*
  397  * MPSAFE
  398  */
  399 /* ARGSUSED */
  400 int
  401 swapoff(td, uap)
  402         struct thread *td;
  403         struct swapoff_args *uap;
  404 {
  405         struct vnode *vp;
  406         struct nameidata nd;
  407         struct swdevt *sp;
  408         swblk_t dvbase, vsbase;
  409         u_long nblks, aligned_nblks, blk;
  410         int error, index;
  411 
  412         mtx_lock(&Giant);
  413 
  414         error = suser(td);
  415         if (error)
  416                 goto done2;
  417 
  418         while (swdev_syscall_active)
  419             tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
  420         swdev_syscall_active = 1;
  421 
  422         NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
  423         error = namei(&nd);
  424         if (error)
  425                 goto done;
  426         NDFREE(&nd, NDF_ONLY_PNBUF);
  427         vp = nd.ni_vp;
  428 
  429         for (sp = swdevt, index = 0 ; index < nswdev; index++, sp++) {
  430                 if (sp->sw_vp == vp)
  431                         goto found;
  432         }
  433         error = EINVAL;
  434         goto done;
  435 found:
  436 #ifdef MAC
  437         (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  438         error = mac_check_system_swapoff(td->td_ucred, vp);
  439         (void) VOP_UNLOCK(vp, 0, td);
  440         if (error != 0)
  441                 goto done;
  442 #endif
  443         
  444         nblks = sp->sw_nblks;
  445 
  446         /*
  447          * We can turn off this swap device safely only if the
  448          * available virtual memory in the system will fit the amount
  449          * of data we will have to page back in, plus an epsilon so
  450          * the system doesn't become critically low on swap space.
  451          */
  452         if (cnt.v_free_count + cnt.v_cache_count + vm_swap_size <
  453             nblks + nswap_lowat) {
  454                 error = ENOMEM;
  455                 goto done;
  456         }
  457 
  458         /*
  459          * Prevent further allocations on this device.
  460          */
  461         sp->sw_flags |= SW_CLOSING;
  462         for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) {
  463                 blk = min(nblks - dvbase, dmmax);
  464                 vsbase = index * dmmax + dvbase * nswdev;
  465                 vm_swap_size -= blist_fill(swapblist, vsbase, blk);
  466         }
  467 
  468         /*
  469          * Page in the contents of the device and close it.
  470          */
  471 #ifndef NO_SWAPPING
  472         vm_proc_swapin_all(index);
  473 #endif /* !NO_SWAPPING */
  474         swap_pager_swapoff(index, &sp->sw_used);
  475 
  476         VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
  477         vrele(vp);
  478         sp->sw_vp = NULL;
  479 
  480         /*
  481          * Resize the bitmap based on the new largest swap device,
  482          * or free the bitmap if there are no more devices.
  483          */
  484         for (sp = swdevt, nblks = 0; sp < swdevt + nswdev; sp++) {
  485                 if (sp->sw_vp == NULL)
  486                         continue;
  487                 nblks = max(nblks, sp->sw_nblks);
  488         }
  489 
  490         aligned_nblks = (nblks + (dmmax -1)) & ~(u_long)(dmmax -1);
  491         nswap = aligned_nblks * nswdev;
  492 
  493         if (nswap == 0) {
  494                 blist_destroy(swapblist);
  495                 swapblist = NULL;
  496                 vrele(swapdev_vp);
  497                 swapdev_vp = NULL;
  498         } else
  499                 blist_resize(&swapblist, nswap, 0);
  500 
  501 done:
  502         swdev_syscall_active = 0;
  503         wakeup_one(&swdev_syscall_active);
  504 done2:
  505         mtx_unlock(&Giant);
  506         return (error);
  507 }
  508 
  509 static int
  510 sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
  511 {
  512         int     *name = (int *)arg1;
  513         int     error, i, n;
  514         struct xswdev xs;
  515         struct swdevt *sp;
  516 
  517         if (arg2 != 1) /* name length */
  518                 return (EINVAL);
  519 
  520         for (sp = swdevt, i = 0, n = 0 ; i < nswdev; i++, sp++) {
  521                 if (sp->sw_vp) {
  522                         if (n == *name) {
  523                                 xs.xsw_version = XSWDEV_VERSION;
  524                                 xs.xsw_dev = sp->sw_dev;
  525                                 xs.xsw_flags = sp->sw_flags;
  526                                 xs.xsw_nblks = sp->sw_nblks;
  527                                 xs.xsw_used = sp->sw_used;
  528 
  529                                 error = SYSCTL_OUT(req, &xs, sizeof(xs));
  530                                 return (error);
  531                         }
  532                         n++;
  533                 }
  534 
  535         }
  536         return (ENOENT);
  537 }
  538 
  539 SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswdev, 0,
  540     "Number of swap devices");
  541 SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info,
  542     "Swap statistics by device");

Cache object: ea0e5de57e9a0ab4b0689f5bfc55b645


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.