


FreeBSD/Linux Kernel Cross Reference
sys/uvm/uvm_swap.c


    1 /*      $NetBSD: uvm_swap.c,v 1.140.4.1 2008/12/27 18:22:49 snj Exp $   */
    2 
    3 /*
    4  * Copyright (c) 1995, 1996, 1997 Matthew R. Green
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
   17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
   20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   26  * SUCH DAMAGE.
   27  *
   28  * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
   29  * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
   30  */
   31 
   32 #include <sys/cdefs.h>
   33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.140.4.1 2008/12/27 18:22:49 snj Exp $");
   34 
   35 #include "fs_nfs.h"
   36 #include "opt_uvmhist.h"
   37 #include "opt_compat_netbsd.h"
   38 #include "opt_ddb.h"
   39 
   40 #include <sys/param.h>
   41 #include <sys/systm.h>
   42 #include <sys/buf.h>
   43 #include <sys/bufq.h>
   44 #include <sys/conf.h>
   45 #include <sys/proc.h>
   46 #include <sys/namei.h>
   47 #include <sys/disklabel.h>
   48 #include <sys/errno.h>
   49 #include <sys/kernel.h>
   50 #include <sys/malloc.h>
   51 #include <sys/vnode.h>
   52 #include <sys/file.h>
   53 #include <sys/vmem.h>
   54 #include <sys/blist.h>
   55 #include <sys/mount.h>
   56 #include <sys/pool.h>
   57 #include <sys/syscallargs.h>
   58 #include <sys/swap.h>
   59 #include <sys/kauth.h>
   60 #include <sys/sysctl.h>
   61 #include <sys/workqueue.h>
   62 
   63 #include <uvm/uvm.h>
   64 
   65 #include <miscfs/specfs/specdev.h>
   66 
   67 /*
   68  * uvm_swap.c: manage configuration and i/o to swap space.
   69  */
   70 
   71 /*
   72  * swap space is managed in the following way:
   73  *
   74  * each swap partition or file is described by a "swapdev" structure.
   75  * each "swapdev" structure contains a "swapent" structure which contains
   76  * information that is passed up to the user (via system calls).
   77  *
   78  * each swap partition is assigned a "priority" (int) which controls
   80  * swap partition usage.
   80  *
   81  * the system maintains a global data structure describing all swap
   82  * partitions/files.   there is a sorted LIST of "swappri" structures
   83  * which describe "swapdev"'s at that priority.   this LIST is headed
   84  * by the "swap_priority" global var.    each "swappri" contains a
   85  * CIRCLEQ of "swapdev" structures at that priority.
   86  *
   87  * locking:
   88  *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
   89  *    system call and prevents the swap priority list from changing
   90  *    while we are in the middle of a system call (e.g. SWAP_STATS).
   91  *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
   92  *    structures including the priority list, the swapdev structures,
   93  *    and the swapmap arena.
   94  *
   95  * each swap device has the following info:
   96  *  - swap device in use (could be disabled, preventing future use)
   97  *  - swap enabled (allows new allocations on swap)
   98  *  - map info in /dev/drum
   99  *  - vnode pointer
  100  * for swap files only:
  101  *  - block size
  102  *  - max byte count in buffer
  103  *  - buffer
  104  *
  105  * userland controls and configures swap with the swapctl(2) system call.
  106  * the sys_swapctl performs the following operations:
  107  *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
  108  *  [2] SWAP_STATS: given a pointer to an array of swapent structures
  109  *      (passed in via "arg") of a size passed in via "misc" ... we load
  110  *      the current swap config into the array. The actual work is done
  111  *      in the uvm_swap_stats(9) function.
  112  *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
  113  *      priority in "misc", start swapping on it.
  114  *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
  115  *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
  116  *      "misc")
  117  */
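/*
 * As a concrete illustration of the swapctl(2) interface described
 * above, here is a minimal userland sketch (assuming the standard
 * NetBSD <sys/swap.h>/<unistd.h> userland API; error handling
 * abbreviated).  It exercises SWAP_NSWAP and SWAP_STATS, and is
 * guarded with #if 0 since it is not kernel code.
 */
#if 0
#include <sys/param.h>
#include <sys/swap.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
        struct swapent *sep;
        int i, n;

        /* [1] SWAP_NSWAP: number of configured swap devices. */
        n = swapctl(SWAP_NSWAP, NULL, 0);
        if (n <= 0)
                return (1);

        /* [2] SWAP_STATS: load the current config into an array. */
        sep = calloc(n, sizeof(*sep));
        if (sep == NULL)
                return (1);
        n = swapctl(SWAP_STATS, sep, n);
        for (i = 0; i < n; i++)
                printf("%s: priority %d, %d blocks, %d in use\n",
                    sep[i].se_path, sep[i].se_priority,
                    sep[i].se_nblks, sep[i].se_inuse);
        free(sep);
        return (0);
}
#endif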
  118 
  119 /*
  120  * swapdev: describes a single swap partition/file
  121  *
  122  * note the following should be true:
  123  * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
  124  * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
  125  */
  126 struct swapdev {
  127         struct oswapent swd_ose;
  128 #define swd_dev         swd_ose.ose_dev         /* device id */
  129 #define swd_flags       swd_ose.ose_flags       /* flags:inuse/enable/fake */
  130 #define swd_priority    swd_ose.ose_priority    /* our priority */
  131         /* also: swd_ose.ose_nblks, swd_ose.ose_inuse */
  132         char                    *swd_path;      /* saved pathname of device */
  133         int                     swd_pathlen;    /* length of pathname */
  134         int                     swd_npages;     /* #pages we can use */
  135         int                     swd_npginuse;   /* #pages in use */
  136         int                     swd_npgbad;     /* #pages bad */
  137         int                     swd_drumoffset; /* page0 offset in drum */
  138         int                     swd_drumsize;   /* #pages in drum */
  139         blist_t                 swd_blist;      /* blist for this swapdev */
  140         struct vnode            *swd_vp;        /* backing vnode */
  141         CIRCLEQ_ENTRY(swapdev)  swd_next;       /* priority circleq */
  142 
  143         int                     swd_bsize;      /* blocksize (bytes) */
  144         int                     swd_maxactive;  /* max active i/o reqs */
  145         struct bufq_state       *swd_tab;       /* buffer list */
  146         int                     swd_active;     /* number of active buffers */
  147 };
  148 
  149 /*
  150  * swap device priority entry; the list is kept sorted on `spi_priority'.
  151  */
  152 struct swappri {
  153         int                     spi_priority;     /* priority */
  154         CIRCLEQ_HEAD(spi_swapdev, swapdev)      spi_swapdev;
  155         /* circleq of swapdevs at this priority */
  156         LIST_ENTRY(swappri)     spi_swappri;      /* global list of pri's */
  157 };
  158 
  159 /*
  160  * The following two structures are used to keep track of data transfers
  161  * on swap devices associated with regular files.
  162  * NOTE: this code is more or less a copy of vnd.c; we use the same
   163  * structure names here to ease porting.
  164  */
  165 struct vndxfer {
  166         struct buf      *vx_bp;         /* Pointer to parent buffer */
  167         struct swapdev  *vx_sdp;
  168         int             vx_error;
  169         int             vx_pending;     /* # of pending aux buffers */
  170         int             vx_flags;
  171 #define VX_BUSY         1
  172 #define VX_DEAD         2
  173 };
  174 
  175 struct vndbuf {
  176         struct buf      vb_buf;
  177         struct vndxfer  *vb_xfer;
  178 };
  179 
  180 
  181 /*
  182  * We keep a pool of vndbuf's and vndxfer structures.
  183  */
  184 POOL_INIT(vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx", NULL,
  185     IPL_BIO);
  186 POOL_INIT(vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd", NULL,
  187     IPL_BIO);
  188 
  189 /*
  190  * local variables
  191  */
  192 MALLOC_DEFINE(M_VMSWAP, "VM swap", "VM swap structures");
  193 static vmem_t *swapmap; /* controls the mapping of /dev/drum */
  194 
  195 /* list of all active swap devices [by priority] */
  196 LIST_HEAD(swap_priority, swappri);
  197 static struct swap_priority swap_priority;
  198 
  199 /* locks */
  200 static krwlock_t swap_syscall_lock;
  201 
  202 /* workqueue and use counter for swap to regular files */
  203 static int sw_reg_count = 0;
  204 static struct workqueue *sw_reg_workqueue;
  205 
  206 /* tuneables */
  207 u_int uvm_swapisfull_factor = 99;
  208 
  209 /*
  210  * prototypes
  211  */
  212 static struct swapdev   *swapdrum_getsdp(int);
  213 
  214 static struct swapdev   *swaplist_find(struct vnode *, bool);
  215 static void              swaplist_insert(struct swapdev *,
  216                                          struct swappri *, int);
  217 static void              swaplist_trim(void);
  218 
  219 static int swap_on(struct lwp *, struct swapdev *);
  220 static int swap_off(struct lwp *, struct swapdev *);
  221 
  222 static void uvm_swap_stats_locked(int, struct swapent *, int, register_t *);
  223 
  224 static void sw_reg_strategy(struct swapdev *, struct buf *, int);
  225 static void sw_reg_biodone(struct buf *);
  226 static void sw_reg_iodone(struct work *wk, void *dummy);
  227 static void sw_reg_start(struct swapdev *);
  228 
  229 static int uvm_swap_io(struct vm_page **, int, int, int);
  230 
  231 /*
  232  * uvm_swap_init: init the swap system data structures and locks
  233  *
  234  * => called at boot time from init_main.c after the filesystems
  235  *      are brought up (which happens after uvm_init())
  236  */
  237 void
  238 uvm_swap_init(void)
  239 {
  240         UVMHIST_FUNC("uvm_swap_init");
  241 
  242         UVMHIST_CALLED(pdhist);
  243         /*
  244          * first, init the swap list, its counter, and its lock.
  245          * then get a handle on the vnode for /dev/drum by using
   246          * its dev_t number ("swapdev", from MD conf.c).
  247          */
  248 
  249         LIST_INIT(&swap_priority);
  250         uvmexp.nswapdev = 0;
  251         rw_init(&swap_syscall_lock);
  252         cv_init(&uvm.scheduler_cv, "schedule");
  253         mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
  254 
  255         /* XXXSMP should be at IPL_VM, but for audio interrupt handlers. */
  256         mutex_init(&uvm_scheduler_mutex, MUTEX_SPIN, IPL_SCHED);
  257 
  258         if (bdevvp(swapdev, &swapdev_vp))
  259                 panic("uvm_swap_init: can't get vnode for swap device");
  260         if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
  261                 panic("uvm_swap_init: can't lock swap device");
  262         if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
  263                 panic("uvm_swap_init: can't open swap device");
  264         VOP_UNLOCK(swapdev_vp, 0);
  265 
  266         /*
  267          * create swap block resource map to map /dev/drum.   the range
  268          * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
  269          * that block 0 is reserved (used to indicate an allocation
  270          * failure, or no allocation).
  271          */
  272         swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
  273             VM_NOSLEEP, IPL_NONE);
   274         if (swapmap == NULL)
   275                 panic("uvm_swap_init: vmem_create failed");
  276 
  277         /*
  278          * done!
  279          */
  280         uvm.swap_running = true;
  281 #ifdef __SWAP_BROKEN
  282         uvm.swapout_enabled = 0;
  283 #else
  284         uvm.swapout_enabled = 1;
  285 #endif
  286         UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
  287 
  288         sysctl_createv(NULL, 0, NULL, NULL,
  289             CTLFLAG_READWRITE,
  290             CTLTYPE_INT, "swapout",
  291             SYSCTL_DESCR("Set 0 to disable swapout of kernel stacks"),
  292             NULL, 0, &uvm.swapout_enabled, 0, CTL_VM, CTL_CREATE, CTL_EOL);
  293 }
  294 
  295 /*
  296  * swaplist functions: functions that operate on the list of swap
  297  * devices on the system.
  298  */
  299 
  300 /*
  301  * swaplist_insert: insert swap device "sdp" into the global list
  302  *
  303  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
  304  * => caller must provide a newly malloc'd swappri structure (we will
   305  *      FREE it if we don't need it... this is to prevent malloc blocking
  306  *      here while adding swap)
  307  */
  308 static void
  309 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
  310 {
  311         struct swappri *spp, *pspp;
  312         UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);
  313 
  314         /*
  315          * find entry at or after which to insert the new device.
  316          */
  317         pspp = NULL;
  318         LIST_FOREACH(spp, &swap_priority, spi_swappri) {
  319                 if (priority <= spp->spi_priority)
  320                         break;
  321                 pspp = spp;
  322         }
  323 
  324         /*
  325          * new priority?
  326          */
  327         if (spp == NULL || spp->spi_priority != priority) {
  328                 spp = newspp;  /* use newspp! */
  329                 UVMHIST_LOG(pdhist, "created new swappri = %d",
  330                             priority, 0, 0, 0);
  331 
  332                 spp->spi_priority = priority;
  333                 CIRCLEQ_INIT(&spp->spi_swapdev);
  334 
  335                 if (pspp)
  336                         LIST_INSERT_AFTER(pspp, spp, spi_swappri);
  337                 else
  338                         LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
  339         } else {
  340                 /* we don't need a new priority structure, free it */
  341                 FREE(newspp, M_VMSWAP);
  342         }
  343 
  344         /*
  345          * priority found (or created).   now insert on the priority's
  346          * circleq list and bump the total number of swapdevs.
  347          */
  348         sdp->swd_priority = priority;
  349         CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
  350         uvmexp.nswapdev++;
  351 }
  352 
  353 /*
  354  * swaplist_find: find and optionally remove a swap device from the
  355  *      global list.
  356  *
  357  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
  358  * => we return the swapdev we found (and removed)
  359  */
  360 static struct swapdev *
  361 swaplist_find(struct vnode *vp, bool remove)
  362 {
  363         struct swapdev *sdp;
  364         struct swappri *spp;
  365 
  366         /*
  367          * search the lists for the requested vp
  368          */
  369 
  370         LIST_FOREACH(spp, &swap_priority, spi_swappri) {
  371                 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
  372                         if (sdp->swd_vp == vp) {
  373                                 if (remove) {
  374                                         CIRCLEQ_REMOVE(&spp->spi_swapdev,
  375                                             sdp, swd_next);
  376                                         uvmexp.nswapdev--;
  377                                 }
  378                                 return(sdp);
  379                         }
  380                 }
  381         }
  382         return (NULL);
  383 }
  384 
  385 /*
  386  * swaplist_trim: scan priority list for empty priority entries and kill
  387  *      them.
  388  *
  389  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
  390  */
  391 static void
  392 swaplist_trim(void)
  393 {
  394         struct swappri *spp, *nextspp;
  395 
  396         for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
  397                 nextspp = LIST_NEXT(spp, spi_swappri);
  398                 if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
  399                     (void *)&spp->spi_swapdev)
  400                         continue;
  401                 LIST_REMOVE(spp, spi_swappri);
  402                 free(spp, M_VMSWAP);
  403         }
  404 }
  405 
  406 /*
  407  * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
  408  *      to the "swapdev" that maps that section of the drum.
  409  *
  410  * => each swapdev takes one big contig chunk of the drum
  411  * => caller must hold uvm_swap_data_lock
  412  */
  413 static struct swapdev *
  414 swapdrum_getsdp(int pgno)
  415 {
  416         struct swapdev *sdp;
  417         struct swappri *spp;
  418 
  419         LIST_FOREACH(spp, &swap_priority, spi_swappri) {
  420                 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
  421                         if (sdp->swd_flags & SWF_FAKE)
  422                                 continue;
  423                         if (pgno >= sdp->swd_drumoffset &&
  424                             pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
  425                                 return sdp;
  426                         }
  427                 }
  428         }
  429         return NULL;
  430 }
  431 
  432 
  433 /*
  434  * sys_swapctl: main entry point for swapctl(2) system call
  435  *      [with two helper functions: swap_on and swap_off]
  436  */
  437 int
  438 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
  439 {
  440         /* {
  441                 syscallarg(int) cmd;
  442                 syscallarg(void *) arg;
  443                 syscallarg(int) misc;
  444         } */
  445         struct vnode *vp;
  446         struct nameidata nd;
  447         struct swappri *spp;
  448         struct swapdev *sdp;
  449         struct swapent *sep;
  450 #define SWAP_PATH_MAX (PATH_MAX + 1)
  451         char    *userpath;
  452         size_t  len;
  453         int     error, misc;
  454         int     priority;
  455         UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);
  456 
  457         misc = SCARG(uap, misc);
  458 
  459         /*
  460          * ensure serialized syscall access by grabbing the swap_syscall_lock
  461          */
  462         rw_enter(&swap_syscall_lock, RW_WRITER);
  463 
  464         userpath = malloc(SWAP_PATH_MAX, M_TEMP, M_WAITOK);
  465         /*
   466          * we handle the non-priv NSWAP and STATS requests first.
  467          *
  468          * SWAP_NSWAP: return number of config'd swap devices
  469          * [can also be obtained with uvmexp sysctl]
  470          */
  471         if (SCARG(uap, cmd) == SWAP_NSWAP) {
  472                 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
  473                     0, 0, 0);
  474                 *retval = uvmexp.nswapdev;
  475                 error = 0;
  476                 goto out;
  477         }
  478 
  479         /*
  480          * SWAP_STATS: get stats on current # of configured swap devs
  481          *
  482          * note that the swap_priority list can't change as long
  483          * as we are holding the swap_syscall_lock.  we don't want
  484          * to grab the uvm_swap_data_lock because we may fault&sleep during
  485          * copyout() and we don't want to be holding that lock then!
  486          */
  487         if (SCARG(uap, cmd) == SWAP_STATS
  488 #if defined(COMPAT_13)
  489             || SCARG(uap, cmd) == SWAP_OSTATS
  490 #endif
  491             ) {
  492                 if ((size_t)misc > (size_t)uvmexp.nswapdev)
  493                         misc = uvmexp.nswapdev;
  494 #if defined(COMPAT_13)
  495                 if (SCARG(uap, cmd) == SWAP_OSTATS)
  496                         len = sizeof(struct oswapent) * misc;
  497                 else
  498 #endif
  499                         len = sizeof(struct swapent) * misc;
  500                 sep = (struct swapent *)malloc(len, M_TEMP, M_WAITOK);
  501 
  502                 uvm_swap_stats_locked(SCARG(uap, cmd), sep, misc, retval);
  503                 error = copyout(sep, SCARG(uap, arg), len);
  504 
  505                 free(sep, M_TEMP);
  506                 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
  507                 goto out;
  508         }
  509         if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) {
  510                 dev_t   *devp = (dev_t *)SCARG(uap, arg);
  511 
  512                 error = copyout(&dumpdev, devp, sizeof(dumpdev));
  513                 goto out;
  514         }
  515 
  516         /*
  517          * all other requests require superuser privs.   verify.
  518          */
  519         if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
  520             0, NULL, NULL, NULL)))
  521                 goto out;
  522 
  523         if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
  524                 /* drop the current dump device */
  525                 dumpdev = NODEV;
  526                 dumpcdev = NODEV;
  527                 cpu_dumpconf();
  528                 goto out;
  529         }
  530 
  531         /*
  532          * at this point we expect a path name in arg.   we will
  533          * use namei() to gain a vnode reference (vref), and lock
  534          * the vnode (VOP_LOCK).
  535          *
  536          * XXX: a NULL arg means use the root vnode pointer (e.g. for
  537          * miniroot)
  538          */
  539         if (SCARG(uap, arg) == NULL) {
  540                 vp = rootvp;            /* miniroot */
  541                 if (vget(vp, LK_EXCLUSIVE)) {
  542                         error = EBUSY;
  543                         goto out;
  544                 }
  545                 if (SCARG(uap, cmd) == SWAP_ON &&
  546                     copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
  547                         panic("swapctl: miniroot copy failed");
  548         } else {
  549                 int     space;
  550                 char    *where;
  551 
  552                 if (SCARG(uap, cmd) == SWAP_ON) {
  553                         if ((error = copyinstr(SCARG(uap, arg), userpath,
  554                             SWAP_PATH_MAX, &len)))
  555                                 goto out;
  556                         space = UIO_SYSSPACE;
  557                         where = userpath;
  558                 } else {
  559                         space = UIO_USERSPACE;
  560                         where = (char *)SCARG(uap, arg);
  561                 }
  562                 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT,
  563                     space, where);
  564                 if ((error = namei(&nd)))
  565                         goto out;
  566                 vp = nd.ni_vp;
  567         }
  568         /* note: "vp" is referenced and locked */
  569 
  570         error = 0;              /* assume no error */
  571         switch(SCARG(uap, cmd)) {
  572 
  573         case SWAP_DUMPDEV:
  574                 if (vp->v_type != VBLK) {
  575                         error = ENOTBLK;
  576                         break;
  577                 }
  578                 if (bdevsw_lookup(vp->v_rdev)) {
  579                         dumpdev = vp->v_rdev;
  580                         dumpcdev = devsw_blk2chr(dumpdev);
  581                 } else
  582                         dumpdev = NODEV;
  583                 cpu_dumpconf();
  584                 break;
  585 
  586         case SWAP_CTL:
  587                 /*
  588                  * get new priority, remove old entry (if any) and then
  589                  * reinsert it in the correct place.  finally, prune out
  590                  * any empty priority structures.
  591                  */
  592                 priority = SCARG(uap, misc);
  593                 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
  594                 mutex_enter(&uvm_swap_data_lock);
  595                 if ((sdp = swaplist_find(vp, true)) == NULL) {
  596                         error = ENOENT;
  597                 } else {
  598                         swaplist_insert(sdp, spp, priority);
  599                         swaplist_trim();
  600                 }
  601                 mutex_exit(&uvm_swap_data_lock);
  602                 if (error)
  603                         free(spp, M_VMSWAP);
  604                 break;
  605 
  606         case SWAP_ON:
  607 
  608                 /*
  609                  * check for duplicates.   if none found, then insert a
  610                  * dummy entry on the list to prevent someone else from
  611                  * trying to enable this device while we are working on
  612                  * it.
  613                  */
  614 
  615                 priority = SCARG(uap, misc);
  616                 sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
  617                 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
  618                 memset(sdp, 0, sizeof(*sdp));
  619                 sdp->swd_flags = SWF_FAKE;
  620                 sdp->swd_vp = vp;
  621                 sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
  622                 bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
  623                 mutex_enter(&uvm_swap_data_lock);
  624                 if (swaplist_find(vp, false) != NULL) {
  625                         error = EBUSY;
  626                         mutex_exit(&uvm_swap_data_lock);
  627                         bufq_free(sdp->swd_tab);
  628                         free(sdp, M_VMSWAP);
  629                         free(spp, M_VMSWAP);
  630                         break;
  631                 }
  632                 swaplist_insert(sdp, spp, priority);
  633                 mutex_exit(&uvm_swap_data_lock);
  634 
  635                 sdp->swd_pathlen = len;
  636                 sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
  637                 if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
  638                         panic("swapctl: copystr");
  639 
  640                 /*
  641                  * we've now got a FAKE placeholder in the swap list.
  642                  * now attempt to enable swap on it.  if we fail, undo
  643                  * what we've done and kill the fake entry we just inserted.
  644                  * if swap_on is a success, it will clear the SWF_FAKE flag
  645                  */
  646 
  647                 if ((error = swap_on(l, sdp)) != 0) {
  648                         mutex_enter(&uvm_swap_data_lock);
  649                         (void) swaplist_find(vp, true);  /* kill fake entry */
  650                         swaplist_trim();
  651                         mutex_exit(&uvm_swap_data_lock);
  652                         bufq_free(sdp->swd_tab);
  653                         free(sdp->swd_path, M_VMSWAP);
  654                         free(sdp, M_VMSWAP);
  655                         break;
  656                 }
  657                 break;
  658 
  659         case SWAP_OFF:
  660                 mutex_enter(&uvm_swap_data_lock);
  661                 if ((sdp = swaplist_find(vp, false)) == NULL) {
  662                         mutex_exit(&uvm_swap_data_lock);
  663                         error = ENXIO;
  664                         break;
  665                 }
  666 
  667                 /*
  668                  * If a device isn't in use or enabled, we
  669                  * can't stop swapping from it (again).
  670                  */
  671                 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
  672                         mutex_exit(&uvm_swap_data_lock);
  673                         error = EBUSY;
  674                         break;
  675                 }
  676 
  677                 /*
  678                  * do the real work.
  679                  */
  680                 error = swap_off(l, sdp);
  681                 break;
  682 
  683         default:
  684                 error = EINVAL;
  685         }
  686 
  687         /*
  688          * done!  release the ref gained by namei() and unlock.
  689          */
  690         vput(vp);
  691 
  692 out:
  693         free(userpath, M_TEMP);
  694         rw_exit(&swap_syscall_lock);
  695 
  696         UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
  697         return (error);
  698 }
  699 
  700 /*
  701  * swap_stats: implements swapctl(SWAP_STATS). The function is kept
  702  * away from sys_swapctl() in order to allow COMPAT_* swapctl()
  703  * emulation to use it directly without going through sys_swapctl().
  704  * The problem with using sys_swapctl() there is that it involves
  705  * copying the swapent array to the stackgap, and this array's size
  706  * is not known at build time. Hence it would not be possible to
  707  * ensure it would fit in the stackgap in any case.
  708  */
  709 void
  710 uvm_swap_stats(int cmd, struct swapent *sep, int sec, register_t *retval)
  711 {
  712 
  713         rw_enter(&swap_syscall_lock, RW_READER);
  714         uvm_swap_stats_locked(cmd, sep, sec, retval);
  715         rw_exit(&swap_syscall_lock);
  716 }
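/*
 * A hedged sketch of the direct-call pattern the comment above
 * describes: a hypothetical COMPAT_* swapctl emulation (this helper
 * does not exist in this file) can bound "misc", fill a temporary
 * kernel buffer via uvm_swap_stats(), and copyout() the result,
 * mirroring the SWAP_STATS path in sys_swapctl() without touching
 * the stackgap.  Guarded with #if 0 as it is illustrative only.
 */
#if 0
static int
compat_swapctl_stats(void *uarg, int misc, register_t *retval)
{
        struct swapent *sep;
        size_t len;
        int error;

        if ((size_t)misc > (size_t)uvmexp.nswapdev)
                misc = uvmexp.nswapdev;
        len = sizeof(*sep) * misc;
        sep = malloc(len, M_TEMP, M_WAITOK);
        uvm_swap_stats(SWAP_STATS, sep, misc, retval);
        error = copyout(sep, uarg, len);
        free(sep, M_TEMP);
        return (error);
}
#endif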
  717 
  718 static void
  719 uvm_swap_stats_locked(int cmd, struct swapent *sep, int sec, register_t *retval)
  720 {
  721         struct swappri *spp;
  722         struct swapdev *sdp;
  723         int count = 0;
  724 
  725         LIST_FOREACH(spp, &swap_priority, spi_swappri) {
  726                 for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
  727                      sdp != (void *)&spp->spi_swapdev && sec-- > 0;
  728                      sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
  729                         /*
  730                          * backwards compatibility for system call.
  731                          * note that we use 'struct oswapent' as an
  732                          * overlay into both 'struct swapdev' and
  733                          * the userland 'struct swapent', as we
  734                          * want to retain backwards compatibility
  735                          * with NetBSD 1.3.
  736                          */
  737                         sdp->swd_ose.ose_inuse =
  738                             btodb((uint64_t)sdp->swd_npginuse <<
  739                             PAGE_SHIFT);
  740                         (void)memcpy(sep, &sdp->swd_ose,
  741                             sizeof(struct oswapent));
  742 
  743                         /* now copy out the path if necessary */
  744 #if !defined(COMPAT_13)
  745                         (void) cmd;
  746 #endif
  747 #if defined(COMPAT_13)
  748                         if (cmd == SWAP_STATS)
  749 #endif
  750                                 (void)memcpy(&sep->se_path, sdp->swd_path,
  751                                     sdp->swd_pathlen);
  752 
  753                         count++;
  754 #if defined(COMPAT_13)
  755                         if (cmd == SWAP_OSTATS)
  756                                 sep = (struct swapent *)
  757                                     ((struct oswapent *)sep + 1);
  758                         else
  759 #endif
  760                                 sep++;
  761                 }
  762         }
  763 
  764         *retval = count;
  765         return;
  766 }
  767 
  768 /*
  769  * swap_on: attempt to enable a swapdev for swapping.   note that the
  770  *      swapdev is already on the global list, but disabled (marked
  771  *      SWF_FAKE).
  772  *
  773  * => we avoid the start of the disk (to protect disk labels)
  774  * => we also avoid the miniroot, if we are swapping to root.
  775  * => caller should leave uvm_swap_data_lock unlocked, we may lock it
  776  *      if needed.
  777  */
  778 static int
  779 swap_on(struct lwp *l, struct swapdev *sdp)
  780 {
  781         struct vnode *vp;
  782         int error, npages, nblocks, size;
  783         long addr;
  784         u_long result;
  785         struct vattr va;
  786 #ifdef NFS
  787         extern int (**nfsv2_vnodeop_p)(void *);
  788 #endif /* NFS */
  789         const struct bdevsw *bdev;
  790         dev_t dev;
  791         UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);
  792 
  793         /*
  794          * we want to enable swapping on sdp.   the swd_vp contains
  795          * the vnode we want (locked and ref'd), and the swd_dev
   796          * contains the dev_t of the file, if it is a block device.
  797          */
  798 
  799         vp = sdp->swd_vp;
  800         dev = sdp->swd_dev;
  801 
  802         /*
  803          * open the swap file (mostly useful for block device files to
   804          * let the device driver know what is up).
  805          *
  806          * we skip the open/close for root on swap because the root
  807          * has already been opened when root was mounted (mountroot).
  808          */
  809         if (vp != rootvp) {
  810                 if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
  811                         return (error);
  812         }
  813 
  814         /* XXX this only works for block devices */
  815         UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);
  816 
  817         /*
  818          * we now need to determine the size of the swap area.   for
  819          * block specials we can call the d_psize function.
  820          * for normal files, we must stat [get attrs].
  821          *
  822          * we put the result in nblks.
  823          * for normal files, we also want the filesystem block size
  824          * (which we get with statfs).
  825          */
  826         switch (vp->v_type) {
  827         case VBLK:
  828                 bdev = bdevsw_lookup(dev);
  829                 if (bdev == NULL || bdev->d_psize == NULL ||
  830                     (nblocks = (*bdev->d_psize)(dev)) == -1) {
  831                         error = ENXIO;
  832                         goto bad;
  833                 }
  834                 break;
  835 
  836         case VREG:
  837                 if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
  838                         goto bad;
  839                 nblocks = (int)btodb(va.va_size);
  840                 if ((error =
  841                      VFS_STATVFS(vp->v_mount, &vp->v_mount->mnt_stat)) != 0)
  842                         goto bad;
  843 
  844                 sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
  845                 /*
  846                  * limit the max # of outstanding I/O requests we issue
  847                  * at any one time.   take it easy on NFS servers.
  848                  */
  849 #ifdef NFS
  850                 if (vp->v_op == nfsv2_vnodeop_p)
  851                         sdp->swd_maxactive = 2; /* XXX */
  852                 else
  853 #endif /* NFS */
  854                         sdp->swd_maxactive = 8; /* XXX */
  855                 break;
  856 
  857         default:
  858                 error = ENXIO;
  859                 goto bad;
  860         }
  861 
  862         /*
  863          * save nblocks in a safe place and convert to pages.
  864          */
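        /*
         * (worked example, assuming DEV_BSIZE is 512 and PAGE_SIZE is
         * 4096: dbtob(nblocks) is nblocks * 512 bytes, so eight disk
         * blocks make one page and npages = nblocks / 8)
         */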
  865 
  866         sdp->swd_ose.ose_nblks = nblocks;
  867         npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;
  868 
  869         /*
   870          * for block special files, we want to make sure that we leave
  871          * the disklabel and bootblocks alone, so we arrange to skip
  872          * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
  873          * note that because of this the "size" can be less than the
  874          * actual number of blocks on the device.
  875          */
  876         if (vp->v_type == VBLK) {
  877                 /* we use pages 1 to (size - 1) [inclusive] */
  878                 size = npages - 1;
  879                 addr = 1;
  880         } else {
  881                 /* we use pages 0 to (size - 1) [inclusive] */
  882                 size = npages;
  883                 addr = 0;
  884         }
  885 
  886         /*
  887          * make sure we have enough blocks for a reasonable sized swap
  888          * area.   we want at least one page.
  889          */
  890 
  891         if (size < 1) {
   892                 UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
  893                 error = EINVAL;
  894                 goto bad;
  895         }
  896 
  897         UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);
  898 
  899         /*
   900          * now we need to allocate a blist to manage this swap device
  901          */
  902 
  903         sdp->swd_blist = blist_create(npages);
   904         /* mark all except the `saved' region free. */
  905         blist_free(sdp->swd_blist, addr, size);
  906 
  907         /*
  908          * if the vnode we are swapping to is the root vnode
  909          * (i.e. we are swapping to the miniroot) then we want
  910          * to make sure we don't overwrite it.   do a statfs to
  911          * find its size and skip over it.
  912          */
  913         if (vp == rootvp) {
  914                 struct mount *mp;
  915                 struct statvfs *sp;
  916                 int rootblocks, rootpages;
  917 
  918                 mp = rootvnode->v_mount;
  919                 sp = &mp->mnt_stat;
  920                 rootblocks = sp->f_blocks * btodb(sp->f_frsize);
  921                 /*
  922                  * XXX: sp->f_blocks isn't the total number of
  923                  * blocks in the filesystem, it's the number of
  924                  * data blocks.  so, our rootblocks almost
  925                  * definitely underestimates the total size
  926                  * of the filesystem - how badly depends on the
  927                  * details of the filesystem type.  there isn't
  928                  * an obvious way to deal with this cleanly
  929                  * and perfectly, so for now we just pad our
  930                  * rootblocks estimate with an extra 5 percent.
  931                  */
  932                 rootblocks += (rootblocks >> 5) +
  933                         (rootblocks >> 6) +
  934                         (rootblocks >> 7);
  935                 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
  936                 if (rootpages > size)
  937                         panic("swap_on: miniroot larger than swap?");
  938 
  939                 if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
  940                         panic("swap_on: unable to preserve miniroot");
  941                 }
  942 
  943                 size -= rootpages;
  944                 printf("Preserved %d pages of miniroot ", rootpages);
  945                 printf("leaving %d pages of swap\n", size);
  946         }
  947 
  948         /*
  949          * add a ref to vp to reflect usage as a swap device.
  950          */
  951         vref(vp);
  952 
  953         /*
  954          * now add the new swapdev to the drum and enable.
  955          */
  956         result = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP);
  957         if (result == 0)
  958                 panic("swapdrum_add");
  959         /*
  960          * If this is the first regular swap create the workqueue.
  961          * => Protected by swap_syscall_lock.
  962          */
  963         if (vp->v_type != VBLK) {
  964                 if (sw_reg_count++ == 0) {
  965                         KASSERT(sw_reg_workqueue == NULL);
  966                         if (workqueue_create(&sw_reg_workqueue, "swapiod",
  967                             sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
  968                                 panic("swap_add: workqueue_create failed");
  969                 }
  970         }
  971 
  972         sdp->swd_drumoffset = (int)result;
  973         sdp->swd_drumsize = npages;
  974         sdp->swd_npages = size;
  975         mutex_enter(&uvm_swap_data_lock);
  976         sdp->swd_flags &= ~SWF_FAKE;    /* going live */
  977         sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
  978         uvmexp.swpages += size;
  979         uvmexp.swpgavail += size;
  980         mutex_exit(&uvm_swap_data_lock);
  981         return (0);
  982 
  983         /*
  984          * failure: clean up and return error.
  985          */
  986 
  987 bad:
  988         if (sdp->swd_blist) {
  989                 blist_destroy(sdp->swd_blist);
  990         }
  991         if (vp != rootvp) {
  992                 (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
  993         }
  994         return (error);
  995 }
  996 
  997 /*
  998  * swap_off: stop swapping on swapdev
  999  *
 1000  * => swap data should be locked, we will unlock.
 1001  */
 1002 static int
 1003 swap_off(struct lwp *l, struct swapdev *sdp)
 1004 {
 1005         int npages = sdp->swd_npages;
 1006         int error = 0;
 1007 
 1008         UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
 1009         UVMHIST_LOG(pdhist, "  dev=%x, npages=%d", sdp->swd_dev,npages,0,0);
 1010 
 1011         /* disable the swap area being removed */
 1012         sdp->swd_flags &= ~SWF_ENABLE;
 1013         uvmexp.swpgavail -= npages;
 1014         mutex_exit(&uvm_swap_data_lock);
 1015 
 1016         /*
 1017          * the idea is to find all the pages that are paged out to this
 1018          * device, and page them all in.  in uvm, swap-backed pageable
 1019          * memory can take two forms: aobjs and anons.  call the
 1020          * swapoff hook for each subsystem to bring in pages.
 1021          */
 1022 
 1023         if (uao_swap_off(sdp->swd_drumoffset,
 1024                          sdp->swd_drumoffset + sdp->swd_drumsize) ||
 1025             amap_swap_off(sdp->swd_drumoffset,
 1026                           sdp->swd_drumoffset + sdp->swd_drumsize)) {
 1027                 error = ENOMEM;
 1028         } else if (sdp->swd_npginuse > sdp->swd_npgbad) {
 1029                 error = EBUSY;
 1030         }
 1031 
 1032         if (error) {
 1033                 mutex_enter(&uvm_swap_data_lock);
 1034                 sdp->swd_flags |= SWF_ENABLE;
 1035                 uvmexp.swpgavail += npages;
 1036                 mutex_exit(&uvm_swap_data_lock);
 1037 
 1038                 return error;
 1039         }
 1040 
 1041         /*
 1042          * If this is the last regular swap destroy the workqueue.
 1043          * => Protected by swap_syscall_lock.
 1044          */
 1045         if (sdp->swd_vp->v_type != VBLK) {
 1046                 KASSERT(sw_reg_count > 0);
 1047                 KASSERT(sw_reg_workqueue != NULL);
 1048                 if (--sw_reg_count == 0) {
 1049                         workqueue_destroy(sw_reg_workqueue);
 1050                         sw_reg_workqueue = NULL;
 1051                 }
 1052         }
 1053 
 1054         /*
 1055          * done with the vnode.
 1056          * drop our ref on the vnode before calling VOP_CLOSE()
 1057          * so that spec_close() can tell if this is the last close.
 1058          */
 1059         vrele(sdp->swd_vp);
 1060         if (sdp->swd_vp != rootvp) {
 1061                 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
 1062         }
 1063 
 1064         mutex_enter(&uvm_swap_data_lock);
 1065         uvmexp.swpages -= npages;
 1066         uvmexp.swpginuse -= sdp->swd_npgbad;
 1067 
 1068         if (swaplist_find(sdp->swd_vp, true) == NULL)
 1069                 panic("swap_off: swapdev not in list");
 1070         swaplist_trim();
 1071         mutex_exit(&uvm_swap_data_lock);
 1072 
 1073         /*
 1074          * free all resources!
 1075          */
 1076         vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
 1077         blist_destroy(sdp->swd_blist);
 1078         bufq_free(sdp->swd_tab);
 1079         free(sdp, M_VMSWAP);
 1080         return (0);
 1081 }
 1082 
 1083 /*
 1084  * /dev/drum interface and i/o functions
 1085  */
 1086 
 1087 /*
 1088  * swstrategy: perform I/O on the drum
 1089  *
 1090  * => we must map the i/o request from the drum to the correct swapdev.
 1091  */
 1092 static void
 1093 swstrategy(struct buf *bp)
 1094 {
 1095         struct swapdev *sdp;
 1096         struct vnode *vp;
 1097         int pageno, bn;
 1098         UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);
 1099 
 1100         /*
 1101          * convert block number to swapdev.   note that swapdev can't
 1102          * be yanked out from under us because we are holding resources
 1103          * in it (i.e. the blocks we are doing I/O on).
 1104          */
 1105         pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
 1106         mutex_enter(&uvm_swap_data_lock);
 1107         sdp = swapdrum_getsdp(pageno);
 1108         mutex_exit(&uvm_swap_data_lock);
 1109         if (sdp == NULL) {
 1110                 bp->b_error = EINVAL;
 1111                 biodone(bp);
 1112                 UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
 1113                 return;
 1114         }
 1115 
 1116         /*
 1117          * convert drum page number to block number on this swapdev.
 1118          */
 1119 
 1120         pageno -= sdp->swd_drumoffset;  /* page # on swapdev */
 1121         bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
 1122 
 1123         UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld",
 1124                 ((bp->b_flags & B_READ) == 0) ? "write" : "read",
 1125                 sdp->swd_drumoffset, bn, bp->b_bcount);
 1126 
 1127         /*
 1128          * for block devices we finish up here.
 1129          * for regular files we have to do more work which we delegate
 1130          * to sw_reg_strategy().
 1131          */
 1132 
 1133         vp = sdp->swd_vp;               /* swapdev vnode pointer */
 1134         switch (vp->v_type) {
 1135         default:
 1136                 panic("swstrategy: vnode type 0x%x", vp->v_type);
 1137 
 1138         case VBLK:
 1139 
 1140                 /*
 1141                  * must convert "bp" from an I/O on /dev/drum to an I/O
 1142                  * on the swapdev (sdp).
 1143                  */
 1144                 bp->b_blkno = bn;               /* swapdev block number */
 1145                 bp->b_dev = sdp->swd_dev;       /* swapdev dev_t */
 1146 
 1147                 /*
 1148                  * if we are doing a write, we have to redirect the i/o on
 1149                  * drum's v_numoutput counter to the swapdevs.
 1150                  */
 1151                 if ((bp->b_flags & B_READ) == 0) {
 1152                         mutex_enter(bp->b_objlock);
 1153                         vwakeup(bp);    /* kills one 'v_numoutput' on drum */
 1154                         mutex_exit(bp->b_objlock);
 1155                         mutex_enter(&vp->v_interlock);
 1156                         vp->v_numoutput++;      /* put it on swapdev */
 1157                         mutex_exit(&vp->v_interlock);
 1158                 }
 1159 
 1160                 /*
 1161                  * finally plug in swapdev vnode and start I/O
 1162                  */
 1163                 bp->b_vp = vp;
 1164                 bp->b_objlock = &vp->v_interlock;
 1165                 VOP_STRATEGY(vp, bp);
 1166                 return;
 1167 
 1168         case VREG:
 1169                 /*
 1170                  * delegate to sw_reg_strategy function.
 1171                  */
 1172                 sw_reg_strategy(sdp, bp, bn);
 1173                 return;
 1174         }
 1175         /* NOTREACHED */
 1176 }
 1177 
 1178 /*
 1179  * swread: the read function for the drum (just a call to physio)
 1180  */
 1181 /*ARGSUSED*/
 1182 static int
 1183 swread(dev_t dev, struct uio *uio, int ioflag)
 1184 {
 1185         UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);
 1186 
 1187         UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
 1188         return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
 1189 }
 1190 
 1191 /*
 1192  * swwrite: the write function for the drum (just a call to physio)
 1193  */
 1194 /*ARGSUSED*/
 1195 static int
 1196 swwrite(dev_t dev, struct uio *uio, int ioflag)
 1197 {
 1198         UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);
 1199 
 1200         UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
 1201         return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
 1202 }
 1203 
 1204 const struct bdevsw swap_bdevsw = {
 1205         nullopen, nullclose, swstrategy, noioctl, nodump, nosize, D_OTHER,
 1206 };
 1207 
 1208 const struct cdevsw swap_cdevsw = {
 1209         nullopen, nullclose, swread, swwrite, noioctl,
 1210         nostop, notty, nopoll, nommap, nokqfilter, D_OTHER,
 1211 };
 1212 
 1213 /*
 1214  * sw_reg_strategy: handle swap i/o to regular files
 1215  */
 1216 static void
 1217 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
 1218 {
 1219         struct vnode    *vp;
 1220         struct vndxfer  *vnx;
 1221         daddr_t         nbn;
 1222         char            *addr;
 1223         off_t           byteoff;
 1224         int             s, off, nra, error, sz, resid;
 1225         UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);
 1226 
 1227         /*
 1228          * allocate a vndxfer head for this transfer and point it to
 1229          * our buffer.
 1230          */
 1231         vnx = pool_get(&vndxfer_pool, PR_WAITOK);
 1232         vnx->vx_flags = VX_BUSY;
 1233         vnx->vx_error = 0;
 1234         vnx->vx_pending = 0;
 1235         vnx->vx_bp = bp;
 1236         vnx->vx_sdp = sdp;
 1237 
 1238         /*
 1239          * setup for main loop where we read filesystem blocks into
 1240          * our buffer.
 1241          */
 1242         error = 0;
  1243         bp->b_resid = bp->b_bcount;     /* nothing transferred yet! */
 1244         addr = bp->b_data;              /* current position in buffer */
 1245         byteoff = dbtob((uint64_t)bn);
 1246 
 1247         for (resid = bp->b_resid; resid; resid -= sz) {
 1248                 struct vndbuf   *nbp;
 1249 
 1250                 /*
 1251                  * translate byteoffset into block number.  return values:
 1252                  *   vp = vnode of underlying device
 1253                  *  nbn = new block number (on underlying vnode dev)
 1254                  *  nra = num blocks we can read-ahead (excludes requested
 1255                  *      block)
 1256                  */
 1257                 nra = 0;
 1258                 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
 1259                                         &vp, &nbn, &nra);
 1260 
 1261                 if (error == 0 && nbn == (daddr_t)-1) {
 1262                         /*
 1263                          * this used to just set error, but that doesn't
 1264                          * do the right thing.  Instead, it causes random
 1265                          * memory errors.  The panic() should remain until
 1266                          * this condition doesn't destabilize the system.
 1267                          */
 1268 #if 1
 1269                         panic("sw_reg_strategy: swap to sparse file");
 1270 #else
 1271                         error = EIO;    /* failure */
 1272 #endif
 1273                 }
 1274 
 1275                 /*
 1276                  * punt if there was an error or a hole in the file.
 1277                  * we must wait for any i/o ops we have already started
 1278                  * to finish before returning.
 1279                  *
 1280                  * XXX we could deal with holes here but it would be
 1281                  * a hassle (in the write case).
 1282                  */
 1283                 if (error) {
 1284                         s = splbio();
 1285                         vnx->vx_error = error;  /* pass error up */
 1286                         goto out;
 1287                 }
 1288 
 1289                 /*
 1290                  * compute the size ("sz") of this transfer (in bytes).
 1291                  */
 1292                 off = byteoff % sdp->swd_bsize;
 1293                 sz = (1 + nra) * sdp->swd_bsize - off;
 1294                 if (sz > resid)
 1295                         sz = resid;
 1296 
 1297                 UVMHIST_LOG(pdhist, "sw_reg_strategy: "
 1298                             "vp %p/%p offset 0x%x/0x%x",
 1299                             sdp->swd_vp, vp, byteoff, nbn);
 1300 
 1301                 /*
 1302                  * now get a buf structure.   note that the vb_buf is
 1303                  * at the front of the nbp structure so that you can
  1304          * cast pointers between the two structures easily.
 1305                  */
 1306                 nbp = pool_get(&vndbuf_pool, PR_WAITOK);
 1307                 buf_init(&nbp->vb_buf);
 1308                 nbp->vb_buf.b_flags    = bp->b_flags;
 1309                 nbp->vb_buf.b_cflags   = bp->b_cflags;
 1310                 nbp->vb_buf.b_oflags   = bp->b_oflags;
 1311                 nbp->vb_buf.b_bcount   = sz;
 1312                 nbp->vb_buf.b_bufsize  = sz;
 1313                 nbp->vb_buf.b_error    = 0;
 1314                 nbp->vb_buf.b_data     = addr;
 1315                 nbp->vb_buf.b_lblkno   = 0;
 1316                 nbp->vb_buf.b_blkno    = nbn + btodb(off);
 1317                 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
 1318                 nbp->vb_buf.b_iodone   = sw_reg_biodone;
 1319                 nbp->vb_buf.b_vp       = vp;
 1320                 nbp->vb_buf.b_objlock  = &vp->v_interlock;
 1321                 if (vp->v_type == VBLK) {
 1322                         nbp->vb_buf.b_dev = vp->v_rdev;
 1323                 }
 1324 
 1325                 nbp->vb_xfer = vnx;     /* patch it back in to vnx */
 1326 
 1327                 /*
 1328                  * Just sort by block number
 1329                  */
 1330                 s = splbio();
 1331                 if (vnx->vx_error != 0) {
 1332                         buf_destroy(&nbp->vb_buf);
 1333                         pool_put(&vndbuf_pool, nbp);
 1334                         goto out;
 1335                 }
 1336                 vnx->vx_pending++;
 1337 
 1338                 /* sort it in and start I/O if we are not over our limit */
 1339                 /* XXXAD locking */
 1340                 BUFQ_PUT(sdp->swd_tab, &nbp->vb_buf);
 1341                 sw_reg_start(sdp);
 1342                 splx(s);
 1343 
 1344                 /*
 1345                  * advance to the next I/O
 1346                  */
 1347                 byteoff += sz;
 1348                 addr += sz;
 1349         }
 1350 
 1351         s = splbio();
 1352 
 1353 out: /* Arrive here at splbio */
 1354         vnx->vx_flags &= ~VX_BUSY;
 1355         if (vnx->vx_pending == 0) {
 1356                 error = vnx->vx_error;
 1357                 pool_put(&vndxfer_pool, vnx);
 1358                 bp->b_error = error;
 1359                 biodone(bp);
 1360         }
 1361         splx(s);
 1362 }
 1363 
 1364 /*
 1365  * sw_reg_start: start an I/O request on the requested swapdev
 1366  *
 1367  * => reqs are sorted by b_rawblkno (above)
 1368  */
 1369 static void
 1370 sw_reg_start(struct swapdev *sdp)
 1371 {
 1372         struct buf      *bp;
 1373         struct vnode    *vp;
 1374         UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);
 1375 
 1376         /* recursion control */
 1377         if ((sdp->swd_flags & SWF_BUSY) != 0)
 1378                 return;
 1379 
 1380         sdp->swd_flags |= SWF_BUSY;
 1381 
 1382         while (sdp->swd_active < sdp->swd_maxactive) {
 1383                 bp = BUFQ_GET(sdp->swd_tab);
 1384                 if (bp == NULL)
 1385                         break;
 1386                 sdp->swd_active++;
 1387 
 1388                 UVMHIST_LOG(pdhist,
 1389                     "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
 1390                     bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
 1391                 vp = bp->b_vp;
 1392                 KASSERT(bp->b_objlock == &vp->v_interlock);
 1393                 if ((bp->b_flags & B_READ) == 0) {
 1394                         mutex_enter(&vp->v_interlock);
 1395                         vp->v_numoutput++;
 1396                         mutex_exit(&vp->v_interlock);
 1397                 }
 1398                 VOP_STRATEGY(vp, bp);
 1399         }
 1400         sdp->swd_flags &= ~SWF_BUSY;
 1401 }
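
Because sw_reg_iodone() calls back into sw_reg_start(), the SWF_BUSY flag
doubles as a recursion guard, while the loop keeps at most swd_maxactive
buffers in flight. A minimal standalone sketch of that guarded, throttled
dispatch pattern (toy names and a fake queue, not the kernel's):

    #include <stdbool.h>
    #include <stdio.h>

    #define MAXACTIVE 2             /* plays the role of swd_maxactive */

    static bool busy;               /* plays the role of SWF_BUSY */
    static int active;              /* plays the role of swd_active */
    static int pending = 5;         /* fake queue: 5 requests waiting */

    static void kick(void);

    static void
    io_done(void)                   /* completion path, cf. sw_reg_iodone() */
    {
            active--;
            kick();                 /* re-enter the dispatcher */
    }

    static void
    kick(void)                      /* cf. sw_reg_start() */
    {
            if (busy)               /* recursion guard */
                    return;
            busy = true;
            while (active < MAXACTIVE && pending > 0) {
                    pending--;
                    active++;
                    printf("start request (active=%d)\n", active);
            }
            busy = false;
    }

    int
    main(void)
    {
            kick();
            while (active > 0)
                    io_done();      /* simulate completions */
            return 0;
    }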
 1402 
 1403 /*
 1404  * sw_reg_biodone: one of our i/o's has completed
 1405  */
 1406 static void
 1407 sw_reg_biodone(struct buf *bp)
 1408 {
 1409         workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
 1410 }
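
A b_iodone handler runs in interrupt context at IPL_BIO, so sw_reg_biodone()
only hands the buffer to a workqueue; the post-I/O cleanup then runs later in
thread context, where sw_reg_iodone() can safely take locks and start the
next I/O. The queue itself is created once at swap-subsystem initialization;
a sketch of that workqueue(9) call, reconstructed from the names used here
rather than copied from this file:

    /* sketch, assuming the workqueue(9) interface of this era */
    if (workqueue_create(&sw_reg_workqueue, "swapiod",
        sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
            panic("swap: workqueue_create failed");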
 1411 
 1412 /*
 1413  * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 1414  *
 1415  * => note that we can recover the vndbuf struct by casting the buf ptr
 1416  */
 1417 static void
 1418 sw_reg_iodone(struct work *wk, void *dummy)
 1419 {
 1420         struct vndbuf *vbp = (void *)wk;
 1421         struct vndxfer *vnx = vbp->vb_xfer;
 1422         struct buf *pbp = vnx->vx_bp;           /* parent buffer */
 1423         struct swapdev  *sdp = vnx->vx_sdp;
 1424         int s, resid, error;
 1425         KASSERT(&vbp->vb_buf.b_work == wk);
 1426         UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);
 1427 
 1428         UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
 1429             vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
 1430         UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
 1431             vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
 1432 
 1433         /*
 1434          * protect vbp at splbio and update.
 1435          */
 1436 
 1437         s = splbio();
 1438         resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
 1439         pbp->b_resid -= resid;
 1440         vnx->vx_pending--;
 1441 
 1442         if (vbp->vb_buf.b_error != 0) {
 1443                 /* pass error upward */
 1444                 error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
 1445                 UVMHIST_LOG(pdhist, "  got error=%d !", error, 0, 0, 0);
 1446                 vnx->vx_error = error;
 1447         }
 1448 
 1449         /*
 1450          * kill vbp structure
 1451          */
 1452         buf_destroy(&vbp->vb_buf);
 1453         pool_put(&vndbuf_pool, vbp);
 1454 
 1455         /*
 1456          * wrap up this transaction if it has run to completion or, in
 1457          * case of an error, when all auxiliary buffers have returned.
 1458          */
 1459         if (vnx->vx_error != 0) {
 1460                 /* pass error upward */
 1461                 error = vnx->vx_error;
 1462                 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
 1463                         pbp->b_error = error;
 1464                         biodone(pbp);
 1465                         pool_put(&vndxfer_pool, vnx);
 1466                 }
 1467         } else if (pbp->b_resid == 0) {
 1468                 KASSERT(vnx->vx_pending == 0);
 1469                 if ((vnx->vx_flags & VX_BUSY) == 0) {
 1470                         UVMHIST_LOG(pdhist, "  iodone pbp=%p error=%d !",
 1471                             pbp, vnx->vx_error, 0, 0);
 1472                         biodone(pbp);
 1473                         pool_put(&vndxfer_pool, vnx);
 1474                 }
 1475         }
 1476 
 1477         /*
 1478          * done!   start next swapdev I/O if one is pending
 1479          */
 1480         sdp->swd_active--;
 1481         sw_reg_start(sdp);
 1482         splx(s);
 1483 }
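
The (void *)wk cast at the top of sw_reg_iodone() is legal only because
b_work sits at the start of struct buf and vb_buf sits at the start of
struct vndbuf, so all three share one address; the KASSERT guards exactly
that assumption. A standalone illustration of the idiom with toy types:

    #include <assert.h>
    #include <stddef.h>

    /* toy stand-ins for the kernel structures; only the layout matters */
    struct work { int w_dummy; };

    struct buf {
            struct work b_work;     /* first member, as assumed here */
            int b_error;
    };

    struct vndbuf {
            struct buf vb_buf;      /* must remain the first member */
            void *vb_xfer;
    };

    int
    main(void)
    {
            struct vndbuf vnd;
            struct work *wk = &vnd.vb_buf.b_work;
            struct vndbuf *vbp;

            /* both containments are at offset zero: one address names all */
            assert(offsetof(struct vndbuf, vb_buf) == 0);
            assert(offsetof(struct buf, b_work) == 0);

            vbp = (void *)wk;       /* the cast used in sw_reg_iodone() */
            assert(vbp == &vnd);
            assert(&vbp->vb_buf.b_work == wk);  /* cf. the KASSERT above */
            return 0;
    }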
 1484 
 1485 
 1486 /*
 1487  * uvm_swap_alloc: allocate space on swap
 1488  *
 1489  * => allocation is done "round robin" down the priority list; each time
 1490  *      we allocate at a given priority we "rotate" that circle queue.
 1491  * => space can be freed with uvm_swap_free
 1492  * => we return the page slot number in /dev/drum (0 == invalid slot)
 1493  * => we lock uvm_swap_data_lock
 1494  * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 1495  */
 1496 int
 1497 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
 1498 {
 1499         struct swapdev *sdp;
 1500         struct swappri *spp;
 1501         UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);
 1502 
 1503         /*
 1504          * no swap devices configured yet?   definite failure.
 1505          */
 1506         if (uvmexp.nswapdev < 1)
 1507                 return 0;
 1508 
 1509         /*
 1510          * lock the data lock and enter the allocation loop
 1511          */
 1512         mutex_enter(&uvm_swap_data_lock);
 1513 
 1514 ReTry:  /* XXXMRG */
 1515         LIST_FOREACH(spp, &swap_priority, spi_swappri) {
 1516                 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
 1517                         uint64_t result;
 1518 
 1519                         /* if it's not enabled, then we can't swap from it */
 1520                         if ((sdp->swd_flags & SWF_ENABLE) == 0)
 1521                                 continue;
 1522                         if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
 1523                                 continue;
 1524                         result = blist_alloc(sdp->swd_blist, *nslots);
 1525                         if (result == BLIST_NONE) {
 1526                                 continue;
 1527                         }
 1528                         KASSERT(result < sdp->swd_drumsize);
 1529 
 1530                         /*
 1531                          * successful allocation!  now rotate the circleq.
 1532                          */
 1533                         CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
 1534                         CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
 1535                         sdp->swd_npginuse += *nslots;
 1536                         uvmexp.swpginuse += *nslots;
 1537                         mutex_exit(&uvm_swap_data_lock);
 1538                         /* done!  return drum slot number */
 1539                         UVMHIST_LOG(pdhist,
 1540                             "success!  returning %d slots starting at %d",
 1541                             *nslots, result + sdp->swd_drumoffset, 0, 0);
 1542                         return (result + sdp->swd_drumoffset);
 1543                 }
 1544         }
 1545 
 1546         /* XXXMRG: BEGIN HACK */
 1547         if (*nslots > 1 && lessok) {
 1548                 *nslots = 1;
 1549                 /* XXXMRG: ugh!  blist should support this for us */
 1550                 goto ReTry;
 1551         }
 1552         /* XXXMRG: END HACK */
 1553 
 1554         mutex_exit(&uvm_swap_data_lock);
 1555         return 0;
 1556 }
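
Round-robin among same-priority devices is implemented purely by list
rotation: the device that just satisfied an allocation moves to the tail, so
the next scan begins at a different device. A standalone sketch of the
rotation, assuming a BSD-style <sys/queue.h> (toy element type):

    #include <stdio.h>
    #include <sys/queue.h>          /* CIRCLEQ_* macros, as used above */

    struct dev {
            const char *name;
            CIRCLEQ_ENTRY(dev) swd_next;    /* same field name as above */
    };
    CIRCLEQ_HEAD(devlist, dev);

    int
    main(void)
    {
            struct devlist head;
            struct dev a = { .name = "swap0" };
            struct dev b = { .name = "swap1" };
            struct dev *d;

            CIRCLEQ_INIT(&head);
            CIRCLEQ_INSERT_TAIL(&head, &a, swd_next);
            CIRCLEQ_INSERT_TAIL(&head, &b, swd_next);

            /* a successful allocation from the head device rotates it back */
            d = CIRCLEQ_FIRST(&head);
            CIRCLEQ_REMOVE(&head, d, swd_next);
            CIRCLEQ_INSERT_TAIL(&head, d, swd_next);

            printf("next scan starts at %s\n", CIRCLEQ_FIRST(&head)->name);
            return 0;
    }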
 1557 
 1558 /*
 1559  * uvm_swapisfull: return true if most of available swap is allocated
 1560  * and in use.  we don't count some small portion as it may be inaccessible
 1561  * to us at any given moment, for example if there is lock contention or if
 1562  * pages are busy.
 1563  */
 1564 bool
 1565 uvm_swapisfull(void)
 1566 {
 1567         int swpgonly;
 1568         bool rv;
 1569 
 1570         mutex_enter(&uvm_swap_data_lock);
 1571         KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
 1572         swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
 1573             uvm_swapisfull_factor);
 1574         rv = (swpgonly >= uvmexp.swpgavail);
 1575         mutex_exit(&uvm_swap_data_lock);
 1576 
 1577         return (rv);
 1578 }
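
Concretely, assuming the default uvm_swapisfull_factor of 99 (set earlier in
this file), swap counts as full once the pages living only in swap reach
about 99% of the available swap pages. A standalone worked example of the
test:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            int factor = 99;        /* uvm_swapisfull_factor's default */
            int swpgavail = 1000;   /* pages of swap available */
            int swpgonly;

            for (swpgonly = 980; swpgonly <= 1000; swpgonly += 10) {
                    bool full = (int)((uint64_t)swpgonly * 100 / factor)
                        >= swpgavail;
                    printf("swpgonly=%4d -> %s\n", swpgonly,
                        full ? "full" : "not full");
            }
            return 0;   /* prints: 980 not full, 990 full, 1000 full */
    }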
 1579 
 1580 /*
 1581  * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 1582  *
 1583  * => we lock uvm_swap_data_lock
 1584  */
 1585 void
 1586 uvm_swap_markbad(int startslot, int nslots)
 1587 {
 1588         struct swapdev *sdp;
 1589         UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);
 1590 
 1591         mutex_enter(&uvm_swap_data_lock);
 1592         sdp = swapdrum_getsdp(startslot);
 1593         KASSERT(sdp != NULL);
 1594 
 1595         /*
 1596          * we just keep track of how many pages have been marked bad
 1597          * in this device, to make everything add up in swap_off().
 1598          * we assume here that the range of slots will all be within
 1599          * one swap device.
 1600          */
 1601 
 1602         KASSERT(uvmexp.swpgonly >= nslots);
 1603         uvmexp.swpgonly -= nslots;
 1604         sdp->swd_npgbad += nslots;
 1605         UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0);
 1606         mutex_exit(&uvm_swap_data_lock);
 1607 }
 1608 
 1609 /*
 1610  * uvm_swap_free: free swap slots
 1611  *
 1612  * => this can be all or part of an allocation made by uvm_swap_alloc
 1613  * => we lock uvm_swap_data_lock
 1614  */
 1615 void
 1616 uvm_swap_free(int startslot, int nslots)
 1617 {
 1618         struct swapdev *sdp;
 1619         UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);
 1620 
 1621         UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
 1622             startslot, 0, 0);
 1623 
 1624         /*
 1625          * ignore attempts to free the "bad" slot.
 1626          */
 1627 
 1628         if (startslot == SWSLOT_BAD) {
 1629                 return;
 1630         }
 1631 
 1632         /*
 1633          * convert the drum slot offset back to an sdp, free the blocks
 1634          * in the blist, and return.   must hold the swap data lock to
 1635          * do the lookup and access the blist.
 1636          */
 1637 
 1638         mutex_enter(&uvm_swap_data_lock);
 1639         sdp = swapdrum_getsdp(startslot);
 1640         KASSERT(uvmexp.nswapdev >= 1);
 1641         KASSERT(sdp != NULL);
 1642         KASSERT(sdp->swd_npginuse >= nslots);
 1643         blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
 1644         sdp->swd_npginuse -= nslots;
 1645         uvmexp.swpginuse -= nslots;
 1646         mutex_exit(&uvm_swap_data_lock);
 1647 }
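
Freeing relies on the global drum layout: each configured swapdev owns the
slot range [swd_drumoffset, swd_drumoffset + swd_drumsize), so a drum slot
maps back to a device plus a device-local block, which is what
swapdrum_getsdp() and the blist_free() call above compute between them. A
standalone sketch of that mapping with toy types:

    #include <stdio.h>

    struct toy_swapdev {
            int drumoffset;         /* cf. swd_drumoffset */
            int drumsize;           /* cf. swd_drumsize */
            const char *name;
    };

    int
    main(void)
    {
            struct toy_swapdev devs[] = {
                    { 0,    1024, "swap0" },
                    { 1024, 2048, "swap1" },
            };
            int slot = 1500;        /* a global /dev/drum slot */
            size_t i;

            for (i = 0; i < sizeof(devs) / sizeof(devs[0]); i++) {
                    struct toy_swapdev *sdp = &devs[i];
                    if (slot >= sdp->drumoffset &&
                        slot < sdp->drumoffset + sdp->drumsize)
                            /* cf. swapdrum_getsdp() + blist_free() above */
                            printf("slot %d -> %s, local block %d\n",
                                slot, sdp->name, slot - sdp->drumoffset);
            }
            return 0;   /* prints: slot 1500 -> swap1, local block 476 */
    }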
 1648 
 1649 /*
 1650  * uvm_swap_put: put any number of pages into a contiguous region of swap
 1651  *
 1652  * => can be sync or async
 1653  */
 1654 
 1655 int
 1656 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
 1657 {
 1658         int error;
 1659 
 1660         error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
 1661             ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
 1662         return error;
 1663 }
 1664 
 1665 /*
 1666  * uvm_swap_get: get a single page from swap
 1667  *
 1668  * => usually a sync op (from fault)
 1669  */
 1670 
 1671 int
 1672 uvm_swap_get(struct vm_page *page, int swslot, int flags)
 1673 {
 1674         int error;
 1675 
 1676         uvmexp.nswget++;
 1677         KASSERT(flags & PGO_SYNCIO);
 1678         if (swslot == SWSLOT_BAD) {
 1679                 return EIO;
 1680         }
 1681 
 1682         error = uvm_swap_io(&page, swslot, 1, B_READ |
 1683             ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
 1684         if (error == 0) {
 1685 
 1686                 /*
 1687                  * this page is no longer only in swap.
 1688                  */
 1689 
 1690                 mutex_enter(&uvm_swap_data_lock);
 1691                 KASSERT(uvmexp.swpgonly > 0);
 1692                 uvmexp.swpgonly--;
 1693                 mutex_exit(&uvm_swap_data_lock);
 1694         }
 1695         return error;
 1696 }
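
Both wrappers translate the pager-level PGO_SYNCIO flag into the absence of
the buffer-level B_ASYNC flag before calling uvm_swap_io(); uvm_swap_get()
additionally insists on synchronous operation and refuses slots previously
marked bad. A standalone sketch of the translation (the flag values here are
illustrative, not the kernel's):

    #include <stdio.h>

    #define PGO_SYNCIO      0x02    /* illustrative values only */
    #define B_WRITE         0x00
    #define B_ASYNC         0x04

    static int
    put_flags(int pgo_flags)
    {
            /* same expression as in uvm_swap_put() above */
            return B_WRITE | ((pgo_flags & PGO_SYNCIO) ? 0 : B_ASYNC);
    }

    int
    main(void)
    {
            printf("async write: 0x%x\n", put_flags(0));          /* B_ASYNC */
            printf("sync  write: 0x%x\n", put_flags(PGO_SYNCIO)); /* no B_ASYNC */
            return 0;
    }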
 1697 
 1698 /*
 1699  * uvm_swap_io: do an i/o operation to swap
 1700  */
 1701 
 1702 static int
 1703 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
 1704 {
 1705         daddr_t startblk;
 1706         struct  buf *bp;
 1707         vaddr_t kva;
 1708         int     error, mapinflags;
 1709         bool write, async;
 1710         UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);
 1711 
 1712         UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
 1713             startslot, npages, flags, 0);
 1714 
 1715         write = (flags & B_READ) == 0;
 1716         async = (flags & B_ASYNC) != 0;
 1717 
 1718         /*
 1719          * allocate a buf for the i/o.
 1720          */
 1721 
 1722         KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async));
 1723         bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
 1724         if (bp == NULL) {
 1725                 uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
 1726                 return ENOMEM;
 1727         }
 1728 
 1729         /*
 1730          * convert starting drum slot to block number
 1731          */
 1732 
 1733         startblk = btodb((uint64_t)startslot << PAGE_SHIFT);
 1734 
 1735         /*
 1736          * first, map the pages into the kernel.
 1737          */
 1738 
 1739         mapinflags = !write ?
 1740                 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
 1741                 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
 1742         kva = uvm_pagermapin(pps, npages, mapinflags);
 1743 
 1744         /*
 1745          * fill in the bp.   we currently route our i/o through
 1746          * /dev/drum's vnode [swapdev_vp].
 1747          */
 1748 
 1749         bp->b_cflags = BC_BUSY | BC_NOCACHE;
 1750         bp->b_flags = (flags & (B_READ|B_ASYNC));
 1751         bp->b_proc = &proc0;    /* XXX */
 1752         bp->b_vnbufs.le_next = NOLIST;
 1753         bp->b_data = (void *)kva;
 1754         bp->b_blkno = startblk;
 1755         bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
 1756 
 1757         /*
 1758          * bump v_numoutput (counter of number of active outputs).
 1759          */
 1760 
 1761         if (write) {
 1762                 mutex_enter(&swapdev_vp->v_interlock);
 1763                 swapdev_vp->v_numoutput++;
 1764                 mutex_exit(&swapdev_vp->v_interlock);
 1765         }
 1766 
 1767         /*
 1768          * for async ops we must set up the iodone handler.
 1769          */
 1770 
 1771         if (async) {
 1772                 bp->b_iodone = uvm_aio_biodone;
 1773                 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
 1774                 if (curlwp == uvm.pagedaemon_lwp)
 1775                         BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
 1776                 else
 1777                         BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
 1778         } else {
 1779                 bp->b_iodone = NULL;
 1780                 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
 1781         }
 1782         UVMHIST_LOG(pdhist,
 1783             "about to start io: data = %p blkno = 0x%x, bcount = %ld",
 1784             bp->b_data, bp->b_blkno, bp->b_bcount, 0);
 1785 
 1786         /*
 1787          * now we start the I/O, and if async, return.
 1788          */
 1789 
 1790         VOP_STRATEGY(swapdev_vp, bp);
 1791         if (async)
 1792                 return 0;
 1793 
 1794         /*
 1795          * must be sync i/o.   wait for it to finish
 1796          */
 1797 
 1798         error = biowait(bp);
 1799 
 1800         /*
 1801          * kill the pager mapping
 1802          */
 1803 
 1804         uvm_pagermapout(kva, npages);
 1805 
 1806         /*
 1807          * now dispose of the buf and we're done.
 1808          */
 1809 
 1810         if (write) {
 1811                 mutex_enter(&swapdev_vp->v_interlock);
 1812                 vwakeup(bp);
 1813                 mutex_exit(&swapdev_vp->v_interlock);
 1814         }
 1815         putiobuf(bp);
 1816         UVMHIST_LOG(pdhist, "<- done (sync)  error=%d", error, 0, 0, 0);
 1817 
 1818         return (error);
 1819 }
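
The slot-to-block conversion above shifts the slot number up by PAGE_SHIFT to
get a byte offset in the drum, then down to device blocks with btodb(). A
standalone sketch, assuming 4 KiB pages (PAGE_SHIFT 12) and 512-byte device
blocks (DEV_BSHIFT 9), so each slot spans eight blocks:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT      12              /* assumed: 4 KiB pages */
    #define DEV_BSHIFT      9               /* assumed: 512-byte blocks */
    #define btodb(x)        ((x) >> DEV_BSHIFT)     /* bytes -> blocks */

    int
    main(void)
    {
            int slot;

            for (slot = 0; slot < 4; slot++) {
                    uint64_t blk = btodb((uint64_t)slot << PAGE_SHIFT);
                    printf("slot %d -> block %llu\n", slot,
                        (unsigned long long)blk);   /* 0, 8, 16, 24 */
            }
            return 0;
    }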
