FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_subr.c

    1 /*-
    2  * Copyright (c) 1989, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  * (c) UNIX System Laboratories, Inc.
    5  * All or some portions of this file are derived from material licensed
    6  * to the University of California by American Telephone and Telegraph
    7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
    8  * the permission of UNIX System Laboratories, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 4. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
   35  */
   36 
   37 /*
   38  * External virtual filesystem routines
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD: releng/6.0/sys/kern/vfs_subr.c 151671 2005-10-25 20:43:25Z kris $");
   43 
   44 #include "opt_ddb.h"
   45 #include "opt_mac.h"
   46 
   47 #include <sys/param.h>
   48 #include <sys/systm.h>
   49 #include <sys/bio.h>
   50 #include <sys/buf.h>
   51 #include <sys/conf.h>
   52 #include <sys/dirent.h>
   53 #include <sys/event.h>
   54 #include <sys/eventhandler.h>
   55 #include <sys/extattr.h>
   56 #include <sys/file.h>
   57 #include <sys/fcntl.h>
   58 #include <sys/kdb.h>
   59 #include <sys/kernel.h>
   60 #include <sys/kthread.h>
   61 #include <sys/mac.h>
   62 #include <sys/malloc.h>
   63 #include <sys/mount.h>
   64 #include <sys/namei.h>
   65 #include <sys/reboot.h>
   66 #include <sys/sleepqueue.h>
   67 #include <sys/stat.h>
   68 #include <sys/sysctl.h>
   69 #include <sys/syslog.h>
   70 #include <sys/vmmeter.h>
   71 #include <sys/vnode.h>
   72 
   73 #include <machine/stdarg.h>
   74 
   75 #include <vm/vm.h>
   76 #include <vm/vm_object.h>
   77 #include <vm/vm_extern.h>
   78 #include <vm/pmap.h>
   79 #include <vm/vm_map.h>
   80 #include <vm/vm_page.h>
   81 #include <vm/vm_kern.h>
   82 #include <vm/uma.h>
   83 
   84 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
   85 
   86 static void     delmntque(struct vnode *vp);
   87 static void     insmntque(struct vnode *vp, struct mount *mp);
   88 static int      flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
   89                     int slpflag, int slptimeo);
   90 static void     syncer_shutdown(void *arg, int howto);
   91 static int      vtryrecycle(struct vnode *vp);
   92 static void     vbusy(struct vnode *vp);
   93 static void     vdropl(struct vnode *vp);
   94 static void     vinactive(struct vnode *, struct thread *);
   95 static void     v_incr_usecount(struct vnode *);
   96 static void     v_decr_usecount(struct vnode *);
   97 static void     v_decr_useonly(struct vnode *);
   98 static void     vfree(struct vnode *);
   99 static void     vnlru_free(int);
  100 static void     vdestroy(struct vnode *);
  101 static void     vgonel(struct vnode *);
  102 static void     vfs_knllock(void *arg);
  103 static void     vfs_knlunlock(void *arg);
  104 static int      vfs_knllocked(void *arg);
  105 
  106 
  107 /*
  108  * Enable Giant pushdown based on whether or not the vm is mpsafe in this
  109  * build.  Without mpsafevm the buffer cache cannot run Giant free.
  110  */
  111 #if defined(__alpha__) || defined(__amd64__) || defined(__i386__) || \
  112         defined(__sparc64__)
  113 int mpsafe_vfs = 1;
  114 #else
  115 int mpsafe_vfs;
  116 #endif
  117 TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs);
  118 SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0,
  119     "MPSAFE VFS");
  120 
  121 /*
  122  * Number of vnodes in existence.  Increased whenever getnewvnode()
  123  * allocates a new vnode, never decreased.
  124  */
  125 static unsigned long    numvnodes;
  126 
  127 SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
  128 
  129 /*
  130  * Conversion tables for conversion from vnode types to inode formats
  131  * and back.
  132  */
  133 enum vtype iftovt_tab[16] = {
  134         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
  135         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
  136 };
  137 int vttoif_tab[9] = {
  138         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
  139         S_IFSOCK, S_IFIFO, S_IFMT,
  140 };
  141 
  142 /*
  143  * List of vnodes that are ready for recycling.
  144  */
  145 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
  146 
  147 /*
  148  * Free vnode target.  Free vnodes may simply be files which have been stat'd
  149  * but not read.  This is somewhat common, and a small cache of such files
  150  * should be kept to avoid recreation costs.
  151  */
  152 static u_long wantfreevnodes;
  153 SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
  154 /* Number of vnodes in the free list. */
  155 static u_long freevnodes;
  156 SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
  157 
  158 /*
  159  * Various variables used for debugging the new implementation of
  160  * reassignbuf().
  161  * XXX these are probably of (very) limited utility now.
  162  */
  163 static int reassignbufcalls;
  164 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
  165 
  166 /*
  167  * Cache for the mount type id assigned to NFS.  This is used for
  168  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
  169  */
  170 int     nfs_mount_type = -1;
  171 
  172 /* To keep more than one thread at a time from running vfs_getnewfsid */
  173 static struct mtx mntid_mtx;
  174 
  175 /*
  176  * Lock for any access to the following:
  177  *      vnode_free_list
  178  *      numvnodes
  179  *      freevnodes
  180  */
  181 static struct mtx vnode_free_list_mtx;
  182 
  183 /* Publicly exported FS */
  184 struct nfs_public nfs_pub;
  185 
  186 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
  187 static uma_zone_t vnode_zone;
  188 static uma_zone_t vnodepoll_zone;
  189 
  190 /* Set to 1 to print out reclaim of active vnodes */
  191 int     prtactive;
  192 
  193 /*
  194  * The workitem queue.
  195  *
  196  * It is useful to delay writes of file data and filesystem metadata
  197  * for tens of seconds so that quickly created and deleted files need
  198  * not waste disk bandwidth being created and removed. To realize this,
  199  * we append vnodes to a "workitem" queue. When running with a soft
  200  * updates implementation, most pending metadata dependencies should
  201  * not wait for more than a few seconds. Thus, writes to mounted block
  202  * devices are delayed only about half the time that file data is delayed.
  203  * Similarly, directory updates are more critical, so are only delayed
  204  * about a third the time that file data is delayed. Thus, there are
  205  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
  206  * one each second (driven off the filesystem syncer process). The
  207  * syncer_delayno variable indicates the next queue that is to be processed.
  208  * Items that need to be processed soon are placed in this queue:
  209  *
  210  *      syncer_workitem_pending[syncer_delayno]
  211  *
  212  * A delay of fifteen seconds is done by placing the request fifteen
  213  * entries later in the queue:
  214  *
  215  *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
  216  *
  217  */
  218 static int syncer_delayno;
  219 static long syncer_mask;
  220 LIST_HEAD(synclist, bufobj);
  221 static struct synclist *syncer_workitem_pending;
  222 /*
  223  * The sync_mtx protects:
  224  *      bo->bo_synclist
  225  *      sync_vnode_count
  226  *      syncer_delayno
  227  *      syncer_state
  228  *      syncer_workitem_pending
  229  *      syncer_worklist_len
  230  *      rushjob
  231  */
  232 static struct mtx sync_mtx;
  233 
  234 #define SYNCER_MAXDELAY         32
  235 static int syncer_maxdelay = SYNCER_MAXDELAY;   /* maximum delay time */
  236 static int syncdelay = 30;              /* max time to delay syncing data */
  237 static int filedelay = 30;              /* time to delay syncing files */
  238 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
  239 static int dirdelay = 29;               /* time to delay syncing directories */
  240 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
  241 static int metadelay = 28;              /* time to delay syncing metadata */
  242 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
  243 static int rushjob;             /* number of slots to run ASAP */
  244 static int stat_rush_requests;  /* number of times I/O speeded up */
  245 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
  246 
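/*
 * A minimal sketch (compiled out; the helper name is hypothetical) of how a
 * delay of "delay" seconds maps onto the workitem queues described above.
 * The real insertion is done by the syncer list code, cf.
 * vn_syncer_add_to_worklist().
 */
#if 0
static void
example_syncer_delay(struct bufobj *bo, int delay)
{

	mtx_lock(&sync_mtx);
	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	/* A delay of "delay" seconds is "delay" slots past syncer_delayno. */
	LIST_INSERT_HEAD(&syncer_workitem_pending[(syncer_delayno + delay) &
	    syncer_mask], bo, bo_synclist);
	mtx_unlock(&sync_mtx);
}
#endif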
  247 /*
  248  * When shutting down the syncer, run it at four times normal speed.
  249  */
  250 #define SYNCER_SHUTDOWN_SPEEDUP         4
  251 static int sync_vnode_count;
  252 static int syncer_worklist_len;
  253 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
  254     syncer_state;
  255 
  256 /*
  257  * Number of vnodes we want to exist at any one time.  This is mostly used
  258  * to size hash tables in vnode-related code.  It is normally not used in
  259  * getnewvnode(), as wantfreevnodes is normally nonzero.
  260  *
  261  * XXX desiredvnodes is historical cruft and should not exist.
  262  */
  263 int desiredvnodes;
  264 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
  265     &desiredvnodes, 0, "Maximum number of vnodes");
  266 SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
  267     &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
  268 static int vnlru_nowhere;
  269 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
  270     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
  271 
  272 /* Hook for calling soft updates. */
  273 int (*softdep_process_worklist_hook)(struct mount *);
  274 
  275 /*
  276  * Macros to control when a vnode is freed and recycled.  All require
  277  * the vnode interlock.
  278  */
  279 #define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
  280 #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
  281 #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
  282 
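/*
 * In other words: a vnode with VI_FREE set and a zero hold count is on the
 * free list and may be recycled; one with VI_FREE clear and a zero hold
 * count should be placed on the free list; and one with VI_FREE set but a
 * non-zero hold count must be taken off the free list (busied) before use.
 */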
  283 
  284 /*
  285  * Initialize the vnode management data structures.
  286  */
  287 #ifndef MAXVNODES_MAX
  288 #define MAXVNODES_MAX   100000
  289 #endif
  290 static void
  291 vntblinit(void *dummy __unused)
  292 {
  293 
  294         /*
  295          * Desiredvnodes is a function of the physical memory size and
  296          * the kernel's heap size.  Specifically, desiredvnodes scales
  297          * in proportion to the physical memory size until two fifths
  298          * of the kernel's heap size is consumed by vnodes and vm
  299          * objects.
  300          */
  301         desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
  302             (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
  303         if (desiredvnodes > MAXVNODES_MAX) {
  304                 if (bootverbose)
  305                         printf("Reducing kern.maxvnodes %d -> %d\n",
  306                             desiredvnodes, MAXVNODES_MAX);
  307                 desiredvnodes = MAXVNODES_MAX;
  308         }
  309         wantfreevnodes = desiredvnodes / 4; 
  310         mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
  311         TAILQ_INIT(&vnode_free_list);
  312         mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
  313         vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
  314             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
  315         vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
  316               NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
  317         /*
  318          * Initialize the filesystem syncer.
  319          */
  320         syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
  321                 &syncer_mask);
  322         syncer_maxdelay = syncer_mask + 1;
  323         mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
  324 }
  325 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
  326 
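/*
 * Worked example of the sizing in vntblinit() (all numbers are assumed,
 * for illustration only): with maxproc = 6000, cnt.v_page_count = 262144
 * (1 GB of 4 KB pages), vm_kmem_size = 320 MB, and
 * sizeof(struct vm_object) + sizeof(struct vnode) taken as roughly 500
 * bytes, the first term is 6000 + 262144 / 4 = 71536 and the second is
 * 2 * 335544320 / (5 * 500) = 268435, so desiredvnodes = 71536 (under the
 * MAXVNODES_MAX cap of 100000) and wantfreevnodes = 71536 / 4 = 17884.
 */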
  327 
  328 /*
  329  * Mark a mount point as busy. Used to synchronize access and to delay
  330  * unmounting. Interlock is not released on failure.
  331  */
  332 int
  333 vfs_busy(mp, flags, interlkp, td)
  334         struct mount *mp;
  335         int flags;
  336         struct mtx *interlkp;
  337         struct thread *td;
  338 {
  339         int lkflags;
  340 
  341         MNT_ILOCK(mp);
  342         if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
  343                 if (flags & LK_NOWAIT) {
  344                         MNT_IUNLOCK(mp);
  345                         return (ENOENT);
  346                 }
  347                 if (interlkp)
  348                         mtx_unlock(interlkp);
  349                 mp->mnt_kern_flag |= MNTK_MWAIT;
  350                 /*
  351                  * Since all busy locks are shared except the exclusive
  352                  * lock granted when unmounting, the only place that a
  353                  * wakeup needs to be done is at the release of the
  354                  * exclusive lock at the end of dounmount.
  355                  */
  356                 msleep(mp, MNT_MTX(mp), PVFS|PDROP, "vfs_busy", 0);
  357                 if (interlkp)
  358                         mtx_lock(interlkp);
  359                 return (ENOENT);
  360         }
  361         if (interlkp)
  362                 mtx_unlock(interlkp);
  363         lkflags = LK_SHARED | LK_INTERLOCK;
  364         if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td))
  365                 panic("vfs_busy: unexpected lock failure");
  366         return (0);
  367 }
  368 
  369 /*
  370  * Free a busy filesystem.
  371  */
  372 void
  373 vfs_unbusy(mp, td)
  374         struct mount *mp;
  375         struct thread *td;
  376 {
  377 
  378         lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
  379 }
  380 
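/*
 * A sketch (compiled out; the helper name is hypothetical) of the usual
 * vfs_busy()/vfs_unbusy() pattern for walking the mount list, as also used
 * by vnlru_proc() below: a mount that cannot be busied without sleeping is
 * simply skipped, and on success vfs_busy() has already dropped the
 * mountlist interlock.
 */
#if 0
static void
example_walk_mountlist(struct thread *td)
{
	struct mount *mp, *nmp;

	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		/* ... work on the busied mount point here ... */
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, td);
	}
	mtx_unlock(&mountlist_mtx);
}
#endif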
  381 /*
  382  * Lookup a mount point by filesystem identifier.
  383  */
  384 struct mount *
  385 vfs_getvfs(fsid)
  386         fsid_t *fsid;
  387 {
  388         struct mount *mp;
  389 
  390         mtx_lock(&mountlist_mtx);
  391         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
  392                 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
  393                     mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
  394                         mtx_unlock(&mountlist_mtx);
  395                         return (mp);
  396                 }
  397         }
  398         mtx_unlock(&mountlist_mtx);
  399         return ((struct mount *) 0);
  400 }
  401 
  402 /*
  403  * Check if a user can access privileged mount options.
  404  */
  405 int
  406 vfs_suser(struct mount *mp, struct thread *td)
  407 {
  408         int error;
  409 
  410         if ((mp->mnt_flag & MNT_USER) == 0 ||
  411             mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
  412                 if ((error = suser(td)) != 0)
  413                         return (error);
  414         }
  415         return (0);
  416 }
  417 
  418 /*
  419  * Get a new unique fsid.  Try to make its val[0] unique, since this value
  420  * will be used to create fake device numbers for stat().  Also try (but
  421  * not so hard) to make its val[0] unique mod 2^16, since some emulators only
  422  * support 16-bit device numbers.  We end up with unique val[0]'s for the
  423  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
  424  *
  425  * Keep in mind that several mounts may be running in parallel.  Starting
  426  * the search one past where the previous search terminated is both a
  427  * micro-optimization and a defense against returning the same fsid to
  428  * different mounts.
  429  */
  430 void
  431 vfs_getnewfsid(mp)
  432         struct mount *mp;
  433 {
  434         static u_int16_t mntid_base;
  435         fsid_t tfsid;
  436         int mtype;
  437 
  438         mtx_lock(&mntid_mtx);
  439         mtype = mp->mnt_vfc->vfc_typenum;
  440         tfsid.val[1] = mtype;
  441         mtype = (mtype & 0xFF) << 24;
  442         for (;;) {
  443                 tfsid.val[0] = makedev(255,
  444                     mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
  445                 mntid_base++;
  446                 if (vfs_getvfs(&tfsid) == NULL)
  447                         break;
  448         }
  449         mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
  450         mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
  451         mtx_unlock(&mntid_mtx);
  452 }
  453 
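/*
 * Worked example (values assumed, for illustration only): for a filesystem
 * type number of 5 and mntid_base of 0x1234, the loop above produces
 * tfsid.val[1] = 5 and tfsid.val[0] = makedev(255,
 * 0x05000000 | ((0x1234 & 0xFF00) << 8) | (0x1234 & 0xFF)), i.e.
 * makedev(255, 0x05120034).  Only the low byte of mntid_base ends up in
 * the low bits of the minor number, which is consistent with the note
 * above that val[0] is unique mod 2^16 only for the first 2^8 calls.
 */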
  454 /*
  455  * Knob to control the precision of file timestamps:
  456  *
  457  *   0 = seconds only; nanoseconds zeroed.
  458  *   1 = seconds and nanoseconds, accurate within 1/HZ.
  459  *   2 = seconds and nanoseconds, truncated to microseconds.
  460  * >=3 = seconds and nanoseconds, maximum precision.
  461  */
  462 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
  463 
  464 static int timestamp_precision = TSP_SEC;
  465 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
  466     &timestamp_precision, 0, "");
  467 
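/*
 * The precision can be changed at run time, e.g.
 * "sysctl vfs.timestamp_precision=3" selects full nanosecond resolution.
 */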
  468 /*
  469  * Get a current timestamp.
  470  */
  471 void
  472 vfs_timestamp(tsp)
  473         struct timespec *tsp;
  474 {
  475         struct timeval tv;
  476 
  477         switch (timestamp_precision) {
  478         case TSP_SEC:
  479                 tsp->tv_sec = time_second;
  480                 tsp->tv_nsec = 0;
  481                 break;
  482         case TSP_HZ:
  483                 getnanotime(tsp);
  484                 break;
  485         case TSP_USEC:
  486                 microtime(&tv);
  487                 TIMEVAL_TO_TIMESPEC(&tv, tsp);
  488                 break;
  489         case TSP_NSEC:
  490         default:
  491                 nanotime(tsp);
  492                 break;
  493         }
  494 }
  495 
  496 /*
  497  * Set vnode attributes to VNOVAL
  498  */
  499 void
  500 vattr_null(vap)
  501         struct vattr *vap;
  502 {
  503 
  504         vap->va_type = VNON;
  505         vap->va_size = VNOVAL;
  506         vap->va_bytes = VNOVAL;
  507         vap->va_mode = VNOVAL;
  508         vap->va_nlink = VNOVAL;
  509         vap->va_uid = VNOVAL;
  510         vap->va_gid = VNOVAL;
  511         vap->va_fsid = VNOVAL;
  512         vap->va_fileid = VNOVAL;
  513         vap->va_blocksize = VNOVAL;
  514         vap->va_rdev = VNOVAL;
  515         vap->va_atime.tv_sec = VNOVAL;
  516         vap->va_atime.tv_nsec = VNOVAL;
  517         vap->va_mtime.tv_sec = VNOVAL;
  518         vap->va_mtime.tv_nsec = VNOVAL;
  519         vap->va_ctime.tv_sec = VNOVAL;
  520         vap->va_ctime.tv_nsec = VNOVAL;
  521         vap->va_birthtime.tv_sec = VNOVAL;
  522         vap->va_birthtime.tv_nsec = VNOVAL;
  523         vap->va_flags = VNOVAL;
  524         vap->va_gen = VNOVAL;
  525         vap->va_vaflags = 0;
  526 }
  527 
  528 /*
  529  * This routine is called when we have too many vnodes.  It attempts
  530  * to free <count> vnodes and will potentially free vnodes that still
  531  * have VM backing store (VM backing store is typically the cause
  532  * of a vnode blowout so we want to do this).  Therefore, this operation
  533  * is not considered cheap.
  534  *
  535  * A number of conditions may prevent a vnode from being reclaimed:
  536  * the buffer cache may have references on the vnode, a directory
  537  * vnode may still have references due to the namei cache representing
  538  * underlying files, or the vnode may be in active use.   It is not
  539  * desirable to reuse such vnodes.  These conditions may cause the
  540  * number of vnodes to reach some minimum value regardless of what
  541  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  542  */
  543 static int
  544 vlrureclaim(struct mount *mp)
  545 {
  546         struct thread *td;
  547         struct vnode *vp;
  548         int done;
  549         int trigger;
  550         int usevnodes;
  551         int count;
  552 
  553         /*
  554          * Calculate the trigger point; don't allow user
  555          * screwups to blow us up.   This prevents us from
  556          * recycling vnodes with lots of resident pages.  We
  557          * aren't trying to free memory, we are trying to
  558          * free vnodes.
  559          */
  560         usevnodes = desiredvnodes;
  561         if (usevnodes <= 0)
  562                 usevnodes = 1;
  563         trigger = cnt.v_page_count * 2 / usevnodes;
  564         done = 0;
  565         td = curthread;
  566         vn_start_write(NULL, &mp, V_WAIT);
  567         MNT_ILOCK(mp);
  568         count = mp->mnt_nvnodelistsize / 10 + 1;
  569         while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
  570                 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
  571                 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
  572                 --count;
  573                 if (!VI_TRYLOCK(vp))
  574                         goto next_iter;
  575                 /*
  576                  * If it's been deconstructed already, it's still
  577                  * referenced, or it exceeds the trigger, skip it.
  578                  */
  579                 if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
  580                     (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
  581                     vp->v_object->resident_page_count > trigger)) {
  582                         VI_UNLOCK(vp);
  583                         goto next_iter;
  584                 }
  585                 MNT_IUNLOCK(mp);
  586                 vholdl(vp);
  587                 if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT, td)) {
  588                         vdrop(vp);
  589                         goto next_iter_mntunlocked;
  590                 }
  591                 VI_LOCK(vp);
  592                 /*
  593                  * v_usecount may have been bumped after VOP_LOCK() dropped
  594                  * the vnode interlock and before it was locked again.
  595                  *
  596                  * It is not necessary to recheck VI_DOOMED because it can
  597                  * only be set by another thread that holds both the vnode
  598                  * lock and vnode interlock.  If another thread has the
  599                  * vnode lock before we get to VOP_LOCK() and obtains the
  600                  * vnode interlock after VOP_LOCK() drops the vnode
  601                  * interlock, the other thread will be unable to drop the
  602                  * vnode lock before our VOP_LOCK() call fails.
  603                  */
  604                 if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
  605                     (vp->v_object != NULL && 
  606                     vp->v_object->resident_page_count > trigger)) {
  607                         VOP_UNLOCK(vp, LK_INTERLOCK, td);
  608                         goto next_iter_mntunlocked;
  609                 }
  610                 KASSERT((vp->v_iflag & VI_DOOMED) == 0,
  611                     ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
  612                 vgonel(vp);
  613                 VOP_UNLOCK(vp, 0, td);
  614                 vdropl(vp);
  615                 done++;
  616 next_iter_mntunlocked:
  617                 if ((count % 256) != 0)
  618                         goto relock_mnt;
  619                 goto yield;
  620 next_iter:
  621                 if ((count % 256) != 0)
  622                         continue;
  623                 MNT_IUNLOCK(mp);
  624 yield:
  625                 uio_yield();
  626 relock_mnt:
  627                 MNT_ILOCK(mp);
  628         }
  629         MNT_IUNLOCK(mp);
  630         vn_finished_write(mp);
  631         return done;
  632 }
  633 
  634 /*
  635  * Attempt to keep the free list at wantfreevnodes length.
  636  */
  637 static void
  638 vnlru_free(int count)
  639 {
  640         struct vnode *vp;
  641 
  642         mtx_assert(&vnode_free_list_mtx, MA_OWNED);
  643         for (; count > 0; count--) {
  644                 vp = TAILQ_FIRST(&vnode_free_list);
  645                 /*
  646                  * The list can be modified while the free_list_mtx
  647                  * has been dropped and vp could be NULL here.
  648                  */
  649                 if (!vp)
  650                         break;
  651                 VNASSERT(vp->v_op != NULL, vp,
  652                     ("vnlru_free: vnode already reclaimed."));
  653                 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
  654                 /*
  655                  * Don't recycle if we can't get the interlock.
  656                  */
  657                 if (!VI_TRYLOCK(vp)) {
  658                         TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
  659                         continue;
  660                 }
  661                 VNASSERT(VCANRECYCLE(vp), vp,
  662                     ("vp inconsistent on freelist"));
  663                 freevnodes--;
  664                 vp->v_iflag &= ~VI_FREE;
  665                 vholdl(vp);
  666                 mtx_unlock(&vnode_free_list_mtx);
  667                 VI_UNLOCK(vp);
  668                 vtryrecycle(vp);
  669                 /*
  670          * If the recycle succeeded, this vdrop will actually free
  671          * the vnode.  If not, it will simply place it back on
  672                  * the free list.
  673                  */
  674                 vdrop(vp);
  675                 mtx_lock(&vnode_free_list_mtx);
  676         }
  677 }
  678 /*
  679  * Attempt to recycle vnodes in a context that is always safe to block.
  680  * Calling vlrureclaim() from the bowels of filesystem code has some
  681  * interesting deadlock problems.
  682  */
  683 static struct proc *vnlruproc;
  684 static int vnlruproc_sig;
  685 
  686 static void
  687 vnlru_proc(void)
  688 {
  689         struct mount *mp, *nmp;
  690         int done;
  691         struct proc *p = vnlruproc;
  692         struct thread *td = FIRST_THREAD_IN_PROC(p);
  693 
  694         mtx_lock(&Giant);
  695 
  696         EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
  697             SHUTDOWN_PRI_FIRST);
  698 
  699         for (;;) {
  700                 kthread_suspend_check(p);
  701                 mtx_lock(&vnode_free_list_mtx);
  702                 if (freevnodes > wantfreevnodes)
  703                         vnlru_free(freevnodes - wantfreevnodes);
  704                 if (numvnodes <= desiredvnodes * 9 / 10) {
  705                         vnlruproc_sig = 0;
  706                         wakeup(&vnlruproc_sig);
  707                         msleep(vnlruproc, &vnode_free_list_mtx,
  708                             PVFS|PDROP, "vlruwt", hz);
  709                         continue;
  710                 }
  711                 mtx_unlock(&vnode_free_list_mtx);
  712                 done = 0;
  713                 mtx_lock(&mountlist_mtx);
  714                 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
  715                         int vfsunlocked;
  716                         if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
  717                                 nmp = TAILQ_NEXT(mp, mnt_list);
  718                                 continue;
  719                         }
  720                         if (!VFS_NEEDSGIANT(mp)) {
  721                                 mtx_unlock(&Giant);
  722                                 vfsunlocked = 1;
  723                         } else
  724                                 vfsunlocked = 0;
  725                         done += vlrureclaim(mp);
  726                         if (vfsunlocked)
  727                                 mtx_lock(&Giant);
  728                         mtx_lock(&mountlist_mtx);
  729                         nmp = TAILQ_NEXT(mp, mnt_list);
  730                         vfs_unbusy(mp, td);
  731                 }
  732                 mtx_unlock(&mountlist_mtx);
  733                 if (done == 0) {
  734 #if 0
  735                         /* These messages are temporary debugging aids */
  736                         if (vnlru_nowhere < 5)
  737                                 printf("vnlru process getting nowhere..\n");
  738                         else if (vnlru_nowhere == 5)
  739                                 printf("vnlru process messages stopped.\n");
  740 #endif
  741                         vnlru_nowhere++;
  742                         tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
  743                 } else 
  744                         uio_yield();
  745         }
  746 }
  747 
  748 static struct kproc_desc vnlru_kp = {
  749         "vnlru",
  750         vnlru_proc,
  751         &vnlruproc
  752 };
  753 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
  754 
  755 /*
  756  * Routines having to do with the management of the vnode table.
  757  */
  758 
  759 static void
  760 vdestroy(struct vnode *vp)
  761 {
  762         struct bufobj *bo;
  763 
  764         CTR1(KTR_VFS, "vdestroy vp %p", vp);
  765         mtx_lock(&vnode_free_list_mtx);
  766         numvnodes--;
  767         mtx_unlock(&vnode_free_list_mtx);
  768         bo = &vp->v_bufobj;
  769         VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
  770             ("cleaned vnode still on the free list."));
  771         VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
  772         VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
  773         VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
  774         VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
  775         VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
  776         VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
  777         VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
  778         VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
  779         VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
  780         VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
  781         VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
  782 #ifdef MAC
  783         mac_destroy_vnode(vp);
  784 #endif
  785         if (vp->v_pollinfo != NULL) {
  786                 knlist_destroy(&vp->v_pollinfo->vpi_selinfo.si_note);
  787                 mtx_destroy(&vp->v_pollinfo->vpi_lock);
  788                 uma_zfree(vnodepoll_zone, vp->v_pollinfo);
  789         }
  790 #ifdef INVARIANTS
  791         /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
  792         vp->v_op = NULL;
  793 #endif
  794         lockdestroy(vp->v_vnlock);
  795         mtx_destroy(&vp->v_interlock);
  796         uma_zfree(vnode_zone, vp);
  797 }
  798 
  799 /*
  800  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
  801  * before we actually vgone().  This function must be called with the vnode
  802  * held to prevent the vnode from being returned to the free list midway
  803  * through vgone().
  804  */
  805 static int
  806 vtryrecycle(struct vnode *vp)
  807 {
  808         struct thread *td = curthread;
  809         struct mount *vnmp;
  810 
  811         CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp);
  812         VNASSERT(vp->v_holdcnt, vp,
  813             ("vtryrecycle: Recycling vp %p without a reference.", vp));
  814         /*
  815          * This vnode may be found and locked via some other list; if so, we
  816          * can't recycle it yet.
  817          */
  818         if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
  819                 return (EWOULDBLOCK);
  820         /*
  821          * Don't recycle if its filesystem is being suspended.
  822          */
  823         if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
  824                 VOP_UNLOCK(vp, 0, td);
  825                 return (EBUSY);
  826         }
  827         /*
  828          * If we got this far, we need to acquire the interlock and see if
  829          * anyone picked up this vnode from another list.  If not, we will
  830          * mark it with DOOMED via vgonel() so that anyone who does find it
  831          * will skip over it.
  832          */
  833         VI_LOCK(vp);
  834         if (vp->v_usecount) {
  835                 VOP_UNLOCK(vp, LK_INTERLOCK, td);
  836                 vn_finished_write(vnmp);
  837                 return (EBUSY);
  838         }
  839         if ((vp->v_iflag & VI_DOOMED) == 0)
  840                 vgonel(vp);
  841         VOP_UNLOCK(vp, LK_INTERLOCK, td);
  842         vn_finished_write(vnmp);
  843         CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp);
  844         return (0);
  845 }
  846 
  847 /*
  848  * Return the next vnode from the free list.
  849  */
  850 int
  851 getnewvnode(tag, mp, vops, vpp)
  852         const char *tag;
  853         struct mount *mp;
  854         struct vop_vector *vops;
  855         struct vnode **vpp;
  856 {
  857         struct vnode *vp = NULL;
  858         struct bufobj *bo;
  859 
  860         mtx_lock(&vnode_free_list_mtx);
  861         /*
  862          * Lend our context to reclaim vnodes if they've exceeded the max.
  863          */
  864         if (freevnodes > wantfreevnodes)
  865                 vnlru_free(1);
  866         /*
  867          * Wait for available vnodes.
  868          */
  869         if (numvnodes > desiredvnodes) {
  870                 if (vnlruproc_sig == 0) {
  871                         vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
  872                         wakeup(vnlruproc);
  873                 }
  874                 msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
  875                     "vlruwk", hz);
  876 #if 0   /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
  877                 if (numvnodes > desiredvnodes) {
  878                         mtx_unlock(&vnode_free_list_mtx);
  879                         return (ENFILE);
  880                 }
  881 #endif
  882         }
  883         numvnodes++;
  884         mtx_unlock(&vnode_free_list_mtx);
  885         vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
  886         /*
  887          * Setup locks.
  888          */
  889         vp->v_vnlock = &vp->v_lock;
  890         mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
  891         /*
  892          * By default, don't allow shared locks unless filesystems
  893          * opt-in.
  894          */
  895         lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
  896         /*
  897          * Initialize bufobj.
  898          */
  899         bo = &vp->v_bufobj;
  900         bo->__bo_vnode = vp;
  901         bo->bo_mtx = &vp->v_interlock;
  902         bo->bo_ops = &buf_ops_bio;
  903         bo->bo_private = vp;
  904         TAILQ_INIT(&bo->bo_clean.bv_hd);
  905         TAILQ_INIT(&bo->bo_dirty.bv_hd);
  906         /*
  907          * Initialize namecache.
  908          */
  909         LIST_INIT(&vp->v_cache_src);
  910         TAILQ_INIT(&vp->v_cache_dst);
  911         /*
  912          * Finalize various vnode identity bits.
  913          */
  914         vp->v_type = VNON;
  915         vp->v_tag = tag;
  916         vp->v_op = vops;
  917         v_incr_usecount(vp);
  918         vp->v_data = 0;
  919 #ifdef MAC
  920         mac_init_vnode(vp);
  921         if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
  922                 mac_associate_vnode_singlelabel(mp, vp);
  923         else if (mp == NULL)
  924                 printf("NULL mp in getnewvnode()\n");
  925 #endif
  926         delmntque(vp);
  927         if (mp != NULL) {
  928                 insmntque(vp, mp);
  929                 bo->bo_bsize = mp->mnt_stat.f_iosize;
  930                 if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
  931                         vp->v_vflag |= VV_NOKNOTE;
  932         }
  933 
  934         CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp);
  935         *vpp = vp;
  936         return (0);
  937 }
  938 
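/*
 * Typical use by a filesystem's vnode-allocation path (sketch only,
 * compiled out; the tag, vop vector and v_data value are placeholders,
 * not taken from any real filesystem):
 */
#if 0
	struct vnode *vp;
	int error;

	error = getnewvnode("examplefs", mp, &examplefs_vnodeops, &vp);
	if (error != 0)
		return (error);
	vp->v_data = ip;	/* attach the per-filesystem in-core data */
#endif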
  939 /*
  940  * Delete from old mount point vnode list, if on one.
  941  */
  942 static void
  943 delmntque(struct vnode *vp)
  944 {
  945         struct mount *mp;
  946 
  947         if (vp->v_mount == NULL)
  948                 return;
  949         mp = vp->v_mount;
  950         MNT_ILOCK(mp);
  951         vp->v_mount = NULL;
  952         VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
  953                 ("bad mount point vnode list size"));
  954         TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
  955         mp->mnt_nvnodelistsize--;
  956         MNT_IUNLOCK(mp);
  957 }
  958 
  959 /*
  960  * Insert into list of vnodes for the new mount point, if available.
  961  */
  962 static void
  963 insmntque(struct vnode *vp, struct mount *mp)
  964 {
  965 
  966         vp->v_mount = mp;
  967         VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
  968         MNT_ILOCK(vp->v_mount);
  969         TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
  970         mp->mnt_nvnodelistsize++;
  971         MNT_IUNLOCK(vp->v_mount);
  972 }
  973 
  974 /*
  975  * Flush out and invalidate all buffers associated with a bufobj.
  976  * Called with the underlying object locked.
  977  */
  978 int
  979 bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo)
  980 {
  981         int error;
  982 
  983         BO_LOCK(bo);
  984         if (flags & V_SAVE) {
  985                 error = bufobj_wwait(bo, slpflag, slptimeo);
  986                 if (error) {
  987                         BO_UNLOCK(bo);
  988                         return (error);
  989                 }
  990                 if (bo->bo_dirty.bv_cnt > 0) {
  991                         BO_UNLOCK(bo);
  992                         if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0)
  993                                 return (error);
  994                         /*
  995                          * XXX We could save a lock/unlock if this was only
  996                          * enabled under INVARIANTS
  997                          */
  998                         BO_LOCK(bo);
  999                         if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
 1000                                 panic("vinvalbuf: dirty bufs");
 1001                 }
 1002         }
 1003         /*
 1004          * If you alter this loop please notice that interlock is dropped and
 1005          * reacquired in flushbuflist.  Special care is needed to ensure that
 1006          * no race conditions occur from this.
 1007          */
 1008         do {
 1009                 error = flushbuflist(&bo->bo_clean,
 1010                     flags, bo, slpflag, slptimeo);
 1011                 if (error == 0)
 1012                         error = flushbuflist(&bo->bo_dirty,
 1013                             flags, bo, slpflag, slptimeo);
 1014                 if (error != 0 && error != EAGAIN) {
 1015                         BO_UNLOCK(bo);
 1016                         return (error);
 1017                 }
 1018         } while (error != 0);
 1019 
 1020         /*
 1021          * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
 1022          * have write I/O in-progress but if there is a VM object then the
 1023          * VM object can also have read-I/O in-progress.
 1024          */
 1025         do {
 1026                 bufobj_wwait(bo, 0, 0);
 1027                 BO_UNLOCK(bo);
 1028                 if (bo->bo_object != NULL) {
 1029                         VM_OBJECT_LOCK(bo->bo_object);
 1030                         vm_object_pip_wait(bo->bo_object, "bovlbx");
 1031                         VM_OBJECT_UNLOCK(bo->bo_object);
 1032                 }
 1033                 BO_LOCK(bo);
 1034         } while (bo->bo_numoutput > 0);
 1035         BO_UNLOCK(bo);
 1036 
 1037         /*
 1038          * Destroy the copy in the VM cache, too.
 1039          */
 1040         if (bo->bo_object != NULL) {
 1041                 VM_OBJECT_LOCK(bo->bo_object);
 1042                 vm_object_page_remove(bo->bo_object, 0, 0,
 1043                         (flags & V_SAVE) ? TRUE : FALSE);
 1044                 VM_OBJECT_UNLOCK(bo->bo_object);
 1045         }
 1046 
 1047 #ifdef INVARIANTS
 1048         BO_LOCK(bo);
 1049         if ((flags & (V_ALT | V_NORMAL)) == 0 &&
 1050             (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
 1051                 panic("vinvalbuf: flush failed");
 1052         BO_UNLOCK(bo);
 1053 #endif
 1054         return (0);
 1055 }
 1056 
 1057 /*
 1058  * Flush out and invalidate all buffers associated with a vnode.
 1059  * Called with the underlying object locked.
 1060  */
 1061 int
 1062 vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag, int slptimeo)
 1063 {
 1064 
 1065         CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags);
 1066         ASSERT_VOP_LOCKED(vp, "vinvalbuf");
 1067         return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo));
 1068 }
 1069 
 1070 /*
 1071  * Flush out buffers on the specified list.
 1072  *
 1073  */
 1074 static int
 1075 flushbuflist(bufv, flags, bo, slpflag, slptimeo)
 1076         struct bufv *bufv;
 1077         int flags;
 1078         struct bufobj *bo;
 1079         int slpflag, slptimeo;
 1080 {
 1081         struct buf *bp, *nbp;
 1082         int retval, error;
 1083         daddr_t lblkno;
 1084         b_xflags_t xflags;
 1085 
 1086         ASSERT_BO_LOCKED(bo);
 1087 
 1088         retval = 0;
 1089         TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
 1090                 if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
 1091                     ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
 1092                         continue;
 1093                 }
 1094                 lblkno = 0;
 1095                 xflags = 0;
 1096                 if (nbp != NULL) {
 1097                         lblkno = nbp->b_lblkno;
 1098                         xflags = nbp->b_xflags &
 1099                                 (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN);
 1100                 }
 1101                 retval = EAGAIN;
 1102                 error = BUF_TIMELOCK(bp,
 1103                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
 1104                     "flushbuf", slpflag, slptimeo);
 1105                 if (error) {
 1106                         BO_LOCK(bo);
 1107                         return (error != ENOLCK ? error : EAGAIN);
 1108                 }
 1109                 KASSERT(bp->b_bufobj == bo,
 1110                     ("bp %p wrong b_bufobj %p should be %p",
 1111                     bp, bp->b_bufobj, bo));
 1112                 if (bp->b_bufobj != bo) {       /* XXX: necessary ? */
 1113                         BUF_UNLOCK(bp);
 1114                         BO_LOCK(bo);
 1115                         return (EAGAIN);
 1116                 }
 1117                 /*
 1118                  * XXX Since there are no node locks for NFS, I
 1119                  * believe there is a slight chance that a delayed
 1120                  * write will occur while sleeping just above, so
 1121                  * check for it.
 1122                  */
 1123                 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
 1124                     (flags & V_SAVE)) {
 1125                         bremfree(bp);
 1126                         bp->b_flags |= B_ASYNC;
 1127                         bwrite(bp);
 1128                         BO_LOCK(bo);
 1129                         return (EAGAIN);        /* XXX: why not loop ? */
 1130                 }
 1131                 bremfree(bp);
 1132                 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
 1133                 bp->b_flags &= ~B_ASYNC;
 1134                 brelse(bp);
 1135                 BO_LOCK(bo);
 1136                 if (nbp != NULL &&
 1137                     (nbp->b_bufobj != bo || 
 1138                      nbp->b_lblkno != lblkno ||
 1139                      (nbp->b_xflags &
 1140                       (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
 1141                         break;                  /* nbp invalid */
 1142         }
 1143         return (retval);
 1144 }
 1145 
 1146 /*
 1147  * Truncate a file's buffer and pages to a specified length.  This
 1148  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 1149  * sync activity.
 1150  */
 1151 int
 1152 vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize)
 1153 {
 1154         struct buf *bp, *nbp;
 1155         int anyfreed;
 1156         int trunclbn;
 1157         struct bufobj *bo;
 1158 
 1159         CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length);
 1160         /*
 1161          * Round up to the *next* lbn.
 1162          */
 1163         trunclbn = (length + blksize - 1) / blksize;
 1164 
 1165         ASSERT_VOP_LOCKED(vp, "vtruncbuf");
 1166 restart:
 1167         VI_LOCK(vp);
 1168         bo = &vp->v_bufobj;
 1169         anyfreed = 1;
 1170         for (;anyfreed;) {
 1171                 anyfreed = 0;
 1172                 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
 1173                         if (bp->b_lblkno < trunclbn)
 1174                                 continue;
 1175                         if (BUF_LOCK(bp,
 1176                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 1177                             VI_MTX(vp)) == ENOLCK)
 1178                                 goto restart;
 1179 
 1180                         bremfree(bp);
 1181                         bp->b_flags |= (B_INVAL | B_RELBUF);
 1182                         bp->b_flags &= ~B_ASYNC;
 1183                         brelse(bp);
 1184                         anyfreed = 1;
 1185 
 1186                         if (nbp != NULL &&
 1187                             (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
 1188                             (nbp->b_vp != vp) ||
 1189                             (nbp->b_flags & B_DELWRI))) {
 1190                                 goto restart;
 1191                         }
 1192                         VI_LOCK(vp);
 1193                 }
 1194 
 1195                 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 1196                         if (bp->b_lblkno < trunclbn)
 1197                                 continue;
 1198                         if (BUF_LOCK(bp,
 1199                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 1200                             VI_MTX(vp)) == ENOLCK)
 1201                                 goto restart;
 1202                         bremfree(bp);
 1203                         bp->b_flags |= (B_INVAL | B_RELBUF);
 1204                         bp->b_flags &= ~B_ASYNC;
 1205                         brelse(bp);
 1206                         anyfreed = 1;
 1207                         if (nbp != NULL &&
 1208                             (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
 1209                             (nbp->b_vp != vp) ||
 1210                             (nbp->b_flags & B_DELWRI) == 0)) {
 1211                                 goto restart;
 1212                         }
 1213                         VI_LOCK(vp);
 1214                 }
 1215         }
 1216 
 1217         if (length > 0) {
 1218 restartsync:
 1219                 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 1220                         if (bp->b_lblkno > 0)
 1221                                 continue;
 1222                         /*
 1223                          * Since we hold the vnode lock this should only
 1224                          * fail if we're racing with the buf daemon.
 1225                          */
 1226                         if (BUF_LOCK(bp,
 1227                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 1228                             VI_MTX(vp)) == ENOLCK) {
 1229                                 goto restart;
 1230                         }
 1231                         VNASSERT((bp->b_flags & B_DELWRI), vp,
 1232                             ("buf(%p) on dirty queue without DELWRI", bp));
 1233 
 1234                         bremfree(bp);
 1235                         bawrite(bp);
 1236                         VI_LOCK(vp);
 1237                         goto restartsync;
 1238                 }
 1239         }
 1240 
 1241         bufobj_wwait(bo, 0, 0);
 1242         VI_UNLOCK(vp);
 1243         vnode_pager_setsize(vp, length);
 1244 
 1245         return (0);
 1246 }
 1247 
 1248 /*
 1249  * buf_splay() - splay tree core for the clean/dirty list of buffers in
 1250  *               a vnode.
 1251  *
 1252  *      NOTE: We have to deal with the special case of a background bitmap
 1253  *      buffer, a situation where two buffers will have the same logical
 1254  *      block offset.  We want (1) only the foreground buffer to be accessed
 1255  *      in a lookup and (2) must differentiate between the foreground and
 1256  *      background buffer in the splay tree algorithm because the splay
 1257  *      tree cannot normally handle multiple entities with the same 'index'.
 1258  *      We accomplish this by adding differentiating flags to the splay tree's
 1259  *      numerical domain.
 1260  */
 1261 static
 1262 struct buf *
 1263 buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
 1264 {
 1265         struct buf dummy;
 1266         struct buf *lefttreemax, *righttreemin, *y;
 1267 
 1268         if (root == NULL)
 1269                 return (NULL);
 1270         lefttreemax = righttreemin = &dummy;
 1271         for (;;) {
 1272                 if (lblkno < root->b_lblkno ||
 1273                     (lblkno == root->b_lblkno &&
 1274                     (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
 1275                         if ((y = root->b_left) == NULL)
 1276                                 break;
 1277                         if (lblkno < y->b_lblkno) {
 1278                                 /* Rotate right. */
 1279                                 root->b_left = y->b_right;
 1280                                 y->b_right = root;
 1281                                 root = y;
 1282                                 if ((y = root->b_left) == NULL)
 1283                                         break;
 1284                         }
 1285                         /* Link into the new root's right tree. */
 1286                         righttreemin->b_left = root;
 1287                         righttreemin = root;
 1288                 } else if (lblkno > root->b_lblkno ||
 1289                     (lblkno == root->b_lblkno &&
 1290                     (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
 1291                         if ((y = root->b_right) == NULL)
 1292                                 break;
 1293                         if (lblkno > y->b_lblkno) {
 1294                                 /* Rotate left. */
 1295                                 root->b_right = y->b_left;
 1296                                 y->b_left = root;
 1297                                 root = y;
 1298                                 if ((y = root->b_right) == NULL)
 1299                                         break;
 1300                         }
 1301                         /* Link into the new root's left tree. */
 1302                         lefttreemax->b_right = root;
 1303                         lefttreemax = root;
 1304                 } else {
 1305                         break;
 1306                 }
 1307                 root = y;
 1308         }
 1309         /* Assemble the new root. */
 1310         lefttreemax->b_right = root->b_left;
 1311         righttreemin->b_left = root->b_right;
 1312         root->b_left = dummy.b_right;
 1313         root->b_right = dummy.b_left;
 1314         return (root);
 1315 }
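/*
 * In effect the splay key is the pair (b_lblkno, b_xflags & BX_BKGRDMARKER):
 * for equal block numbers a buffer without the background marker sorts
 * before one with it, and lookups such as gbincore() below pass 0 for the
 * flag and additionally refuse to return a buffer with the marker set.
 */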
 1316 
 1317 static void
 1318 buf_vlist_remove(struct buf *bp)
 1319 {
 1320         struct buf *root;
 1321         struct bufv *bv;
 1322 
 1323         KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 1324         ASSERT_BO_LOCKED(bp->b_bufobj);
 1325         KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
 1326             (BX_VNDIRTY|BX_VNCLEAN),
 1327             ("buf_vlist_remove: Buf %p is on two lists", bp));
 1328         if (bp->b_xflags & BX_VNDIRTY) 
 1329                 bv = &bp->b_bufobj->bo_dirty;
 1330         else
 1331                 bv = &bp->b_bufobj->bo_clean;
 1332         if (bp != bv->bv_root) {
 1333                 root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
 1334                 KASSERT(root == bp, ("splay lookup failed in remove"));
 1335         }
 1336         if (bp->b_left == NULL) {
 1337                 root = bp->b_right;
 1338         } else {
 1339                 root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
 1340                 root->b_right = bp->b_right;
 1341         }
 1342         bv->bv_root = root;
 1343         TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
 1344         bv->bv_cnt--;
 1345         bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 1346 }
 1347 
 1348 /*
 1349  * Add the buffer to the sorted clean or dirty block list using a
 1350  * splay tree algorithm.
 1351  *
 1352  * NOTE: xflags is passed as a constant, optimizing this inline function!
 1353  */
 1354 static void
 1355 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
 1356 {
 1357         struct buf *root;
 1358         struct bufv *bv;
 1359 
 1360         ASSERT_BO_LOCKED(bo);
 1361         KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
 1362             ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
 1363         bp->b_xflags |= xflags;
 1364         if (xflags & BX_VNDIRTY)
 1365                 bv = &bo->bo_dirty;
 1366         else
 1367                 bv = &bo->bo_clean;
 1368 
 1369         root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
 1370         if (root == NULL) {
 1371                 bp->b_left = NULL;
 1372                 bp->b_right = NULL;
 1373                 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
 1374         } else if (bp->b_lblkno < root->b_lblkno ||
 1375             (bp->b_lblkno == root->b_lblkno &&
 1376             (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
 1377                 bp->b_left = root->b_left;
 1378                 bp->b_right = root;
 1379                 root->b_left = NULL;
 1380                 TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
 1381         } else {
 1382                 bp->b_right = root->b_right;
 1383                 bp->b_left = root;
 1384                 root->b_right = NULL;
 1385                 TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
 1386         }
 1387         bv->bv_cnt++;
 1388         bv->bv_root = bp;
 1389 }
 1390 
 1391 /*
 1392  * Lookup a buffer using the splay tree.  Note that we specifically avoid
 1393  * shadow buffers used in background bitmap writes.
 1394  *
 1395  * This code isn't quite as efficient as it could be because we are maintaining
 1396  * two sorted lists and do not know which list the block resides in.
 1397  *
 1398  * During a "make buildworld" the desired buffer is found at one of
 1399  * the roots more than 60% of the time.  Thus, checking both roots
 1400  * before performing either splay eliminates unnecessary splays on the
 1401  * first tree splayed.
 1402  */
 1403 struct buf *
 1404 gbincore(struct bufobj *bo, daddr_t lblkno)
 1405 {
 1406         struct buf *bp;
 1407 
 1408         ASSERT_BO_LOCKED(bo);
 1409         if ((bp = bo->bo_clean.bv_root) != NULL &&
 1410             bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
 1411                 return (bp);
 1412         if ((bp = bo->bo_dirty.bv_root) != NULL &&
 1413             bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
 1414                 return (bp);
 1415         if ((bp = bo->bo_clean.bv_root) != NULL) {
 1416                 bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
 1417                 if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
 1418                         return (bp);
 1419         }
 1420         if ((bp = bo->bo_dirty.bv_root) != NULL) {
 1421                 bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
 1422                 if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
 1423                         return (bp);
 1424         }
 1425         return (NULL);
 1426 }
 1427 
 1428 /*
 1429  * Associate a buffer with a vnode.
 1430  */
 1431 void
 1432 bgetvp(struct vnode *vp, struct buf *bp)
 1433 {
 1434 
 1435         VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
 1436 
 1437         CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
 1438         VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
 1439             ("bgetvp: bp already attached! %p", bp));
 1440 
 1441         ASSERT_VI_LOCKED(vp, "bgetvp");
 1442         vholdl(vp);
 1443         bp->b_vp = vp;
 1444         bp->b_bufobj = &vp->v_bufobj;
 1445         /*
 1446          * Insert onto list for new vnode.
 1447          */
 1448         buf_vlist_add(bp, &vp->v_bufobj, BX_VNCLEAN);
 1449 }
 1450 
 1451 /*
 1452  * Disassociate a buffer from a vnode.
 1453  */
 1454 void
 1455 brelvp(struct buf *bp)
 1456 {
 1457         struct bufobj *bo;
 1458         struct vnode *vp;
 1459 
 1460         CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 1461         KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 1462 
 1463         /*
 1464          * Delete from old vnode list, if on one.
 1465          */
 1466         vp = bp->b_vp;          /* XXX */
 1467         bo = bp->b_bufobj;
 1468         BO_LOCK(bo);
 1469         if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 1470                 buf_vlist_remove(bp);
 1471         else
 1472                 panic("brelvp: Buffer %p not on queue.", bp);
 1473         if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 1474                 bo->bo_flag &= ~BO_ONWORKLST;
 1475                 mtx_lock(&sync_mtx);
 1476                 LIST_REMOVE(bo, bo_synclist);
 1477                 syncer_worklist_len--;
 1478                 mtx_unlock(&sync_mtx);
 1479         }
 1480         bp->b_vp = NULL;
 1481         bp->b_bufobj = NULL;
 1482         vdropl(vp);
 1483 }
 1484 
 1485 /*
 1486  * Add an item to the syncer work queue.
 1487  */
 1488 static void
 1489 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
 1490 {
 1491         int slot;
 1492 
 1493         ASSERT_BO_LOCKED(bo);
 1494 
 1495         mtx_lock(&sync_mtx);
 1496         if (bo->bo_flag & BO_ONWORKLST)
 1497                 LIST_REMOVE(bo, bo_synclist);
 1498         else {
 1499                 bo->bo_flag |= BO_ONWORKLST;
 1500                 syncer_worklist_len++;
 1501         }
 1502 
 1503         if (delay > syncer_maxdelay - 2)
 1504                 delay = syncer_maxdelay - 2;
 1505         slot = (syncer_delayno + delay) & syncer_mask;
 1506 
 1507         LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
 1508         mtx_unlock(&sync_mtx);
 1509 }
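/*
 * [Editorial sketch -- not part of vfs_subr.c]  vn_syncer_add_to_worklist()
 * files a bufobj on a circular "wheel" of work lists: an item that should
 * be flushed in "delay" seconds lands in slot (syncer_delayno + delay) &
 * syncer_mask, and the syncer daemon advances one slot per second.  The
 * user-space model below captures just that slot arithmetic; the array,
 * names, and main() are invented for illustration.
 */
#include <stdio.h>

#define WHEEL_SIZE      16                      /* must be a power of two */
#define WHEEL_MASK      (WHEEL_SIZE - 1)

static int wheel[WHEEL_SIZE];                   /* pending items per slot */
static int delayno;                             /* slot the daemon is on */

/* File one item to be processed "delay" ticks from now. */
static void
wheel_add(int delay)
{
        if (delay > WHEEL_SIZE - 2)             /* clamp, as the kernel does */
                delay = WHEEL_SIZE - 2;
        wheel[(delayno + delay) & WHEEL_MASK]++;
}

/* Advance the wheel one tick and report how many items came due. */
static int
wheel_tick(void)
{
        int due;

        due = wheel[delayno];
        wheel[delayno] = 0;
        delayno = (delayno + 1) & WHEEL_MASK;
        return (due);
}

int
main(void)
{
        int tick;

        wheel_add(0);                           /* due on the first tick */
        wheel_add(1);                           /* due on the second tick */
        wheel_add(5);                           /* two items due on tick 6 */
        wheel_add(5);
        for (tick = 1; tick <= 6; tick++)
                printf("tick %d: %d item(s) due\n", tick, wheel_tick());
        return (0);
}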
 1510 
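/*
 * Sysctl handler: report the length of the syncer worklist, excluding the
 * per-mount syncer vnodes themselves.
 */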
 1511 static int
 1512 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
 1513 {
 1514         int error, len;
 1515 
 1516         mtx_lock(&sync_mtx);
 1517         len = syncer_worklist_len - sync_vnode_count;
 1518         mtx_unlock(&sync_mtx);
 1519         error = SYSCTL_OUT(req, &len, sizeof(len));
 1520         return (error);
 1521 }
 1522 
 1523 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
 1524     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
 1525 
 1526 static struct proc *updateproc;
 1527 static void sched_sync(void);
 1528 static struct kproc_desc up_kp = {
 1529         "syncer",
 1530         sched_sync,
 1531         &updateproc
 1532 };
 1533 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
 1534 
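/*
 * Flush a single vnode taken from the syncer worklist.  Returns 0 if the
 * vnode was synced, or 1 if it was busy and should be retried on a later
 * pass.  Called with sync_mtx held; the lock is dropped and reacquired
 * around the I/O.
 */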
 1535 static int
 1536 sync_vnode(struct bufobj *bo, struct thread *td)
 1537 {
 1538         struct vnode *vp;
 1539         struct mount *mp;
 1540 
 1541         vp = bo->__bo_vnode;    /* XXX */
 1542         if (VOP_ISLOCKED(vp, NULL) != 0)
 1543                 return (1);
 1544         if (VI_TRYLOCK(vp) == 0)
 1545                 return (1);
 1546         /*
 1547          * We use vhold in case the vnode does not
 1548          * successfully sync.  vhold prevents the vnode from
 1549          * going away when we unlock the sync_mtx so that
 1550          * we can acquire the vnode interlock.
 1551          */
 1552         vholdl(vp);
 1553         mtx_unlock(&sync_mtx);
 1554         VI_UNLOCK(vp);
 1555         if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 1556                 vdrop(vp);
 1557                 mtx_lock(&sync_mtx);
 1558                 return (1);
 1559         }
 1560         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 1561         (void) VOP_FSYNC(vp, MNT_LAZY, td);
 1562         VOP_UNLOCK(vp, 0, td);
 1563         vn_finished_write(mp);
 1564         VI_LOCK(vp);
 1565         if ((bo->bo_flag & BO_ONWORKLST) != 0) {
 1566                 /*
 1567                  * Put us back on the worklist.  The worklist
 1568                  * routine will remove us from our current
 1569                  * position and then add us back in at a later
 1570                  * position.
 1571                  */
 1572                 vn_syncer_add_to_worklist(bo, syncdelay);
 1573         }
 1574         vdropl(vp);
 1575         mtx_lock(&sync_mtx);
 1576         return (0);
 1577 }
 1578 
 1579 /*
 1580  * System filesystem synchronizer daemon.
 1581  */
 1582 static void
 1583 sched_sync(void)
 1584 {
 1585         struct synclist *next;
 1586         struct synclist *slp;
 1587         struct bufobj *bo;
 1588         long starttime;
 1589         struct thread *td = FIRST_THREAD_IN_PROC(updateproc);
 1590         static int dummychan;
 1591         int last_work_seen;
 1592         int net_worklist_len;
 1593         int syncer_final_iter;
 1594         int first_printf;
 1595         int error;
 1596 
 1597         mtx_lock(&Giant);
 1598         last_work_seen = 0;
 1599         syncer_final_iter = 0;
 1600         first_printf = 1;
 1601         syncer_state = SYNCER_RUNNING;
 1602         starttime = time_second;
 1603         td->td_pflags |= TDP_NORUNNINGBUF;
 1604 
 1605         EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
 1606             SHUTDOWN_PRI_LAST);
 1607 
 1608         for (;;) {
 1609                 mtx_lock(&sync_mtx);
 1610                 if (syncer_state == SYNCER_FINAL_DELAY &&
 1611                     syncer_final_iter == 0) {
 1612                         mtx_unlock(&sync_mtx);
 1613                         kthread_suspend_check(td->td_proc);
 1614                         mtx_lock(&sync_mtx);
 1615                 }
 1616                 net_worklist_len = syncer_worklist_len - sync_vnode_count;
 1617                 if (syncer_state != SYNCER_RUNNING &&
 1618                     starttime != time_second) {
 1619                         if (first_printf) {
 1620                                 printf("\nSyncing disks, vnodes remaining...");
 1621                                 first_printf = 0;
 1622                         }
 1623                         printf("%d ", net_worklist_len);
 1624                 }
 1625                 starttime = time_second;
 1626 
 1627                 /*
 1628                  * Push files whose dirty time has expired.  Be careful
 1629                  * of interrupt race on slp queue.
 1630                  *
 1631                  * Skip over empty worklist slots when shutting down.
 1632                  */
 1633                 do {
 1634                         slp = &syncer_workitem_pending[syncer_delayno];
 1635                         syncer_delayno += 1;
 1636                         if (syncer_delayno == syncer_maxdelay)
 1637                                 syncer_delayno = 0;
 1638                         next = &syncer_workitem_pending[syncer_delayno];
 1639                         /*
 1640                          * If the worklist has wrapped since it
 1641                          * was emptied of all but syncer vnodes,
 1642                          * switch to the FINAL_DELAY state and run
 1643                          * for one more second.
 1644                          */
 1645                         if (syncer_state == SYNCER_SHUTTING_DOWN &&
 1646                             net_worklist_len == 0 &&
 1647                             last_work_seen == syncer_delayno) {
 1648                                 syncer_state = SYNCER_FINAL_DELAY;
 1649                                 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
 1650                         }
 1651                 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
 1652                     syncer_worklist_len > 0);
 1653 
 1654                 /*
 1655                  * Keep track of the last time there was anything
 1656                  * on the worklist other than syncer vnodes.
 1657                  * Return to the SHUTTING_DOWN state if any
 1658                  * new work appears.
 1659                  */
 1660                 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
 1661                         last_work_seen = syncer_delayno;
 1662                 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
 1663                         syncer_state = SYNCER_SHUTTING_DOWN;
 1664                 while ((bo = LIST_FIRST(slp)) != NULL) {
 1665                         error = sync_vnode(bo, td);
 1666                         if (error == 1) {
 1667                                 LIST_REMOVE(bo, bo_synclist);
 1668                                 LIST_INSERT_HEAD(next, bo, bo_synclist);
 1669                                 continue;
 1670                         }
 1671                 }
 1672                 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
 1673                         syncer_final_iter--;
 1674                 mtx_unlock(&sync_mtx);
 1675 
 1676                 /*
 1677                  * Do soft update processing.
 1678                  */
 1679                 if (softdep_process_worklist_hook != NULL)
 1680                         (*softdep_process_worklist_hook)(NULL);
 1681 
 1682                 /*
 1683                  * The variable rushjob allows the kernel to speed up the
 1684                  * processing of the filesystem syncer process. A rushjob
 1685                  * value of N tells the filesystem syncer to process the next
 1686                  * N seconds worth of work on its queue ASAP. Currently rushjob
 1687                  * is used by the soft update code to speed up the filesystem
 1688                  * syncer process when the incore state is getting so far
 1689                  * ahead of the disk that the kernel memory pool is being
 1690                  * threatened with exhaustion.
 1691                  */
 1692                 mtx_lock(&sync_mtx);
 1693                 if (rushjob > 0) {
 1694                         rushjob -= 1;
 1695                         mtx_unlock(&sync_mtx);
 1696                         continue;
 1697                 }
 1698                 mtx_unlock(&sync_mtx);
 1699                 /*
 1700                  * Just sleep for a short period of time between
 1701                  * iterations when shutting down to allow some I/O
 1702                  * to happen.
 1703                  *
 1704                  * If it has taken us less than a second to process the
 1705                  * current work, then wait. Otherwise start right over
 1706                  * again. We can still lose time if any single round
 1707                  * takes more than two seconds, but it does not really
 1708                  * matter as we are just trying to generally pace the
 1709                  * filesystem activity.
 1710                  */
 1711                 if (syncer_state != SYNCER_RUNNING)
 1712                         tsleep(&dummychan, PPAUSE, "syncfnl",
 1713                             hz / SYNCER_SHUTDOWN_SPEEDUP);
 1714                 else if (time_second == starttime)
 1715                         tsleep(&lbolt, PPAUSE, "syncer", 0);
 1716         }
 1717 }
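/*
 * [Editorial sketch -- not part of vfs_subr.c]  The pacing policy at the
 * bottom of sched_sync() above: after each pass over one wheel slot the
 * daemon either consumes one unit of "rushjob" credit and starts the next
 * pass immediately, or sleeps until the next second.  A hedged user-space
 * model of that decision; the function names and fixed pass count are
 * invented for illustration.
 */
#include <stdio.h>

static int rushjob;             /* passes to run without sleeping */

static void
simulated_sleep(void)
{
        printf("  sleep until the next second\n");
}

static void
syncer_pass(int pass)
{
        printf("pass %d: flush one slot of dirty vnodes\n", pass);
        if (rushjob > 0) {
                rushjob--;      /* burn one credit and skip the sleep */
                return;
        }
        simulated_sleep();
}

int
main(void)
{
        int i;

        rushjob = 2;            /* e.g. soft updates asked us to hurry */
        for (i = 1; i <= 4; i++)
                syncer_pass(i);
        return (0);
}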
 1718 
 1719 /*
 1720  * Request the syncer daemon to speed up its work.
 1721  * We never push it to speed up more than half of its
 1722  * normal turn time, otherwise it could take over the cpu.
 1723  */
 1724 int
 1725 speedup_syncer()
 1726 {
 1727         struct thread *td;
 1728         int ret = 0;
 1729 
 1730         td = FIRST_THREAD_IN_PROC(updateproc);
 1731         sleepq_remove(td, &lbolt);
 1732         mtx_lock(&sync_mtx);
 1733         if (rushjob < syncdelay / 2) {
 1734                 rushjob += 1;
 1735                 stat_rush_requests += 1;
 1736                 ret = 1;
 1737         }
 1738         mtx_unlock(&sync_mtx);
 1739         return (ret);
 1740 }
 1741 
 1742 /*
 1743  * Tell the syncer to speed up its work and run though its work
 1744  * list several times, then tell it to shut down.
 1745  */
 1746 static void
 1747 syncer_shutdown(void *arg, int howto)
 1748 {
 1749         struct thread *td;
 1750 
 1751         if (howto & RB_NOSYNC)
 1752                 return;
 1753         td = FIRST_THREAD_IN_PROC(updateproc);
 1754         sleepq_remove(td, &lbolt);
 1755         mtx_lock(&sync_mtx);
 1756         syncer_state = SYNCER_SHUTTING_DOWN;
 1757         rushjob = 0;
 1758         mtx_unlock(&sync_mtx);
 1759         kproc_shutdown(arg, howto);
 1760 }
 1761 
 1762 /*
 1763  * Reassign a buffer from one vnode to another.
 1764  * Used to assign file specific control information
 1765  * (indirect blocks) to the vnode to which they belong.
 1766  */
 1767 void
 1768 reassignbuf(struct buf *bp)
 1769 {
 1770         struct vnode *vp;
 1771         struct bufobj *bo;
 1772         int delay;
 1773 #ifdef INVARIANTS
 1774         struct bufv *bv;
 1775 #endif
 1776 
 1777         vp = bp->b_vp;
 1778         bo = bp->b_bufobj;
 1779         ++reassignbufcalls;
 1780 
 1781         CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
 1782             bp, bp->b_vp, bp->b_flags);
 1783         /*
 1784          * B_PAGING flagged buffers cannot be reassigned because their vp
 1785          * is not fully linked in.
 1786          */
 1787         if (bp->b_flags & B_PAGING)
 1788                 panic("cannot reassign paging buffer");
 1789 
 1790         /*
 1791          * Delete from old vnode list, if on one.
 1792          */
 1793         VI_LOCK(vp);
 1794         if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 1795                 buf_vlist_remove(bp);
 1796         else
 1797                 panic("reassignbuf: Buffer %p not on queue.", bp);
 1798         /*
 1799          * If dirty, put on list of dirty buffers; otherwise insert onto list
 1800          * of clean buffers.
 1801          */
 1802         if (bp->b_flags & B_DELWRI) {
 1803                 if ((bo->bo_flag & BO_ONWORKLST) == 0) {
 1804                         switch (vp->v_type) {
 1805                         case VDIR:
 1806                                 delay = dirdelay;
 1807                                 break;
 1808                         case VCHR:
 1809                                 delay = metadelay;
 1810                                 break;
 1811                         default:
 1812                                 delay = filedelay;
 1813                         }
 1814                         vn_syncer_add_to_worklist(bo, delay);
 1815                 }
 1816                 buf_vlist_add(bp, bo, BX_VNDIRTY);
 1817         } else {
 1818                 buf_vlist_add(bp, bo, BX_VNCLEAN);
 1819 
 1820                 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 1821                         mtx_lock(&sync_mtx);
 1822                         LIST_REMOVE(bo, bo_synclist);
 1823                         syncer_worklist_len--;
 1824                         mtx_unlock(&sync_mtx);
 1825                         bo->bo_flag &= ~BO_ONWORKLST;
 1826                 }
 1827         }
 1828 #ifdef INVARIANTS
 1829         bv = &bo->bo_clean;
 1830         bp = TAILQ_FIRST(&bv->bv_hd);
 1831         KASSERT(bp == NULL || bp->b_bufobj == bo,
 1832             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 1833         bp = TAILQ_LAST(&bv->bv_hd, buflists);
 1834         KASSERT(bp == NULL || bp->b_bufobj == bo,
 1835             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 1836         bv = &bo->bo_dirty;
 1837         bp = TAILQ_FIRST(&bv->bv_hd);
 1838         KASSERT(bp == NULL || bp->b_bufobj == bo,
 1839             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 1840         bp = TAILQ_LAST(&bv->bv_hd, buflists);
 1841         KASSERT(bp == NULL || bp->b_bufobj == bo,
 1842             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 1843 #endif
 1844         VI_UNLOCK(vp);
 1845 }
 1846 
 1847 /*
 1848  * Increment the use and hold counts on the vnode, taking care to reference
 1849  * the driver's usecount if this is a chardev.  The vholdl() will remove
 1850  * the vnode from the free list if it is presently free.  Requires the
 1851  * vnode interlock and returns with it held.
 1852  */
 1853 static void
 1854 v_incr_usecount(struct vnode *vp)
 1855 {
 1856 
 1857         CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n",
 1858             vp, vp->v_holdcnt, vp->v_usecount);
 1859         vp->v_usecount++;
 1860         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
 1861                 dev_lock();
 1862                 vp->v_rdev->si_usecount++;
 1863                 dev_unlock();
 1864         }
 1865         vholdl(vp);
 1866 }
 1867 
 1868 /*
 1869  * Decrement the vnode use and hold count along with the driver's usecount
 1870  * if this is a chardev.  The vdropl() below releases the vnode interlock
 1871  * as it may free the vnode.
 1872  */
 1873 static void
 1874 v_decr_usecount(struct vnode *vp)
 1875 {
 1876 
 1877         CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n",
 1878             vp, vp->v_holdcnt, vp->v_usecount);
 1879         ASSERT_VI_LOCKED(vp, __FUNCTION__);
 1880         VNASSERT(vp->v_usecount > 0, vp,
 1881             ("v_decr_usecount: negative usecount"));
 1882         vp->v_usecount--;
 1883         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
 1884                 dev_lock();
 1885                 vp->v_rdev->si_usecount--;
 1886                 dev_unlock();
 1887         }
 1888         vdropl(vp);
 1889 }
 1890 
 1891 /*
 1892  * Decrement only the use count and driver use count.  This is intended to
 1893  * be paired with a follow-on vdropl() to release the remaining hold count.
 1894  * In this way we may vgone() a vnode with a 0 usecount without risk of
 1895  * having it end up on a free list because the hold count is kept above 0.
 1896  */
 1897 static void
 1898 v_decr_useonly(struct vnode *vp)
 1899 {
 1900 
 1901         CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n",
 1902             vp, vp->v_holdcnt, vp->v_usecount);
 1903         ASSERT_VI_LOCKED(vp, __FUNCTION__);
 1904         VNASSERT(vp->v_usecount > 0, vp,
 1905             ("v_decr_useonly: negative usecount"));
 1906         vp->v_usecount--;
 1907         if (vp->v_type == VCHR && vp->v_rdev != NULL) {
 1908                 dev_lock();
 1909                 vp->v_rdev->si_usecount--;
 1910                 dev_unlock();
 1911         }
 1912 }
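/*
 * [Editorial sketch -- not part of vfs_subr.c]  The three helpers above
 * implement a two-counter lifetime scheme: every use reference also takes
 * a hold reference, so a vnode can reach usecount == 0 (run inactive
 * processing) while a remaining hold keeps it off the free list until the
 * final drop.  A hedged user-space model of that ordering; the struct,
 * function names, and printf messages are invented for illustration.
 */
#include <stdio.h>

struct obj {
        int usecount;           /* active users */
        int holdcnt;            /* anyone who cares that the object exists */
};

static void
obj_ref(struct obj *o)          /* cf. v_incr_usecount(): use + hold */
{
        o->usecount++;
        o->holdcnt++;
}

static void
obj_rele(struct obj *o)         /* cf. v_decr_useonly() then vdropl() */
{
        o->usecount--;          /* drop the use count first ... */
        if (o->usecount == 0)
                printf("last user gone: run inactive processing\n");
        o->holdcnt--;           /* ... and only then release the hold */
        if (o->holdcnt == 0)
                printf("last hold gone: object may be freed or recycled\n");
}

int
main(void)
{
        struct obj o = { 0, 0 };

        obj_ref(&o);
        obj_rele(&o);
        return (0);
}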
 1913 
 1914 /*
 1915  * Grab a particular vnode from the free list, increment its
 1916  * reference count and lock it. The vnode lock bit is set if the
 1917  * vnode is being eliminated in vgone. The process is awakened
 1918  * when the transition is completed, and an error returned to
 1919  * indicate that the vnode is no longer usable (possibly having
 1920  * been changed to a new filesystem type).
 1921  */
 1922 int
 1923 vget(vp, flags, td)
 1924         struct vnode *vp;
 1925         int flags;
 1926         struct thread *td;
 1927 {
 1928         int oweinact;
 1929         int oldflags;
 1930         int error;
 1931 
 1932         error = 0;
 1933         oldflags = flags;
 1934         oweinact = 0;
 1935         if ((flags & LK_INTERLOCK) == 0)
 1936                 VI_LOCK(vp);
 1937         /*
 1938          * If the inactive call was deferred because vput() was called
 1939          * with a shared lock, we have to do it here before another thread
 1940          * gets a reference to data that should be dead.
 1941          */
 1942         if (vp->v_iflag & VI_OWEINACT) {
 1943                 if (flags & LK_NOWAIT) {
 1944                         VI_UNLOCK(vp);
 1945                         return (EBUSY);
 1946                 }
 1947                 flags &= ~LK_TYPE_MASK;
 1948                 flags |= LK_EXCLUSIVE;
 1949                 oweinact = 1;
 1950         }
 1951         v_incr_usecount(vp);
 1952         if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
 1953                 VI_LOCK(vp);
 1954                 /*
 1955                  * must expand vrele here because we do not want
 1956                  * to call VOP_INACTIVE if the reference count
 1957                  * drops back to zero since it was never really
 1958                  * active.
 1959                  */
 1960                 v_decr_usecount(vp);
 1961                 return (error);
 1962         }
 1963         if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
 1964                 panic("vget: vn_lock failed to return ENOENT\n");
 1965         if (oweinact) {
 1966                 VI_LOCK(vp);
 1967                 if (vp->v_iflag & VI_OWEINACT)
 1968                         vinactive(vp, td);
 1969                 VI_UNLOCK(vp);
 1970                 if ((oldflags & LK_TYPE_MASK) == 0)
 1971                         VOP_UNLOCK(vp, 0, td);
 1972         }
 1973         return (0);
 1974 }
 1975 
 1976 /*
 1977  * Increase the reference count of a vnode.
 1978  */
 1979 void
 1980 vref(struct vnode *vp)
 1981 {
 1982 
 1983         VI_LOCK(vp);
 1984         v_incr_usecount(vp);
 1985         VI_UNLOCK(vp);
 1986 }
 1987 
 1988 /*
 1989  * Return reference count of a vnode.
 1990  *
 1991  * The results of this call are only guaranteed when some mechanism other
 1992  * than the VI lock is used to stop other processes from gaining references
 1993  * to the vnode.  This may be the case if the caller holds the only reference.
 1994  * This is also useful when stale data is acceptable as race conditions may
 1995  * be accounted for by some other means.
 1996  */
 1997 int
 1998 vrefcnt(struct vnode *vp)
 1999 {
 2000         int usecnt;
 2001 
 2002         VI_LOCK(vp);
 2003         usecnt = vp->v_usecount;
 2004         VI_UNLOCK(vp);
 2005 
 2006         return (usecnt);
 2007 }
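/*
 * [Editorial sketch -- not part of vfs_subr.c]  As the comment above says,
 * vrefcnt() only returns a snapshot: the interlock makes the read itself
 * consistent, but nothing prevents the count from changing the moment the
 * lock is released.  A hedged pthreads model of the same snapshot-read
 * pattern; the struct and names are invented for illustration.
 */
#include <pthread.h>

struct refobj {
        pthread_mutex_t lock;
        int usecount;
};

/* Consistent snapshot of the count; it may be stale by the time it is used. */
static int
refobj_count(struct refobj *o)
{
        int snapshot;

        pthread_mutex_lock(&o->lock);
        snapshot = o->usecount;
        pthread_mutex_unlock(&o->lock);
        return (snapshot);
}

int
main(void)
{
        struct refobj o = { PTHREAD_MUTEX_INITIALIZER, 1 };

        return (refobj_count(&o) == 1 ? 0 : 1);
}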
 2008 
 2009 
 2010 /*
 2011  * Vnode put/release.
 2012  * If count drops to zero, call inactive routine and return to freelist.
 2013  */
 2014 void
 2015 vrele(vp)
 2016         struct vnode *vp;
 2017 {
 2018         struct thread *td = curthread;  /* XXX */
 2019 
 2020         KASSERT(vp != NULL, ("vrele: null vp"));
 2021 
 2022         VI_LOCK(vp);
 2023 
 2024         /* Skip this v_writecount check if we're going to panic below. */
 2025         VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
 2026             ("vrele: missed vn_close"));
 2027 
 2028         if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
 2029             vp->v_usecount == 1)) {
 2030                 v_decr_usecount(vp);
 2031                 return;
 2032         }
 2033         if (vp->v_usecount != 1) {
 2034 #ifdef DIAGNOSTIC
 2035                 vprint("vrele: negative ref count", vp);
 2036 #endif
 2037                 VI_UNLOCK(vp);
 2038                 panic("vrele: negative ref cnt");
 2039         }
 2040         /*
 2041          * We want to hold the vnode until the inactive finishes to
 2042          * prevent vgone() races.  We drop the use count here and the
 2043          * hold count below when we're done.
 2044          */
 2045         v_decr_useonly(vp);
 2046         /*
 2047          * We must call VOP_INACTIVE with the node locked. Mark
 2048          * as VI_DOINGINACT to avoid recursion.
 2049          */
 2050         if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
 2051                 VI_LOCK(vp);
 2052                 vinactive(vp, td);
 2053                 VOP_UNLOCK(vp, 0, td);
 2054         } else
 2055                 VI_LOCK(vp);
 2056         vdropl(vp);
 2057 }
 2058 
 2059 /*
 2060  * Release an already locked vnode.  This gives the same effect as
 2061  * unlock+vrele(), but takes less time and avoids releasing and
 2062  * re-acquiring the lock (as vrele() acquires the lock internally).
 2063  */
 2064 void
 2065 vput(vp)
 2066         struct vnode *vp;
 2067 {
 2068         struct thread *td = curthread;  /* XXX */
 2069         int error;
 2070 
 2071         KASSERT(vp != NULL, ("vput: null vp"));
 2072         ASSERT_VOP_LOCKED(vp, "vput");
 2073         VI_LOCK(vp);
 2074         /* Skip this v_writecount check if we're going to panic below. */
 2075         VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
 2076             ("vput: missed vn_close"));
 2077         error = 0;
 2078 
 2079         if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
 2080             vp->v_usecount == 1)) {
 2081                 VOP_UNLOCK(vp, 0, td);
 2082                 v_decr_usecount(vp);
 2083                 return;
 2084         }
 2085 
 2086         if (vp->v_usecount != 1) {
 2087 #ifdef DIAGNOSTIC
 2088                 vprint("vput: negative ref count", vp);
 2089 #endif
 2090                 panic("vput: negative ref cnt");
 2091         }
 2092         /*
 2093          * We want to hold the vnode until the inactive finishes to
 2094          * prevent vgone() races.  We drop the use count here and the
 2095          * hold count below when we're done.
 2096          */
 2097         v_decr_useonly(vp);
 2098         vp->v_iflag |= VI_OWEINACT;
 2099         if (VOP_ISLOCKED(vp, NULL) != LK_EXCLUSIVE) {
 2100                 error = VOP_LOCK(vp, LK_EXCLUPGRADE|LK_INTERLOCK|LK_NOWAIT, td);
 2101                 VI_LOCK(vp);
 2102                 if (error)
 2103                         goto done;
 2104         }
 2105         if (vp->v_iflag & VI_OWEINACT)
 2106                 vinactive(vp, td);
 2107         VOP_UNLOCK(vp, 0, td);
 2108 done:
 2109         vdropl(vp);
 2110 }
 2111 
 2112 /*
 2113  * Somebody doesn't want the vnode recycled.
 2114  */
 2115 void
 2116 vhold(struct vnode *vp)
 2117 {
 2118 
 2119         VI_LOCK(vp);
 2120         vholdl(vp);
 2121         VI_UNLOCK(vp);
 2122 }
 2123 
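/*
 * As vhold(), but the caller already holds the vnode interlock.
 */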
 2124 void
 2125 vholdl(struct vnode *vp)
 2126 {
 2127 
 2128         vp->v_holdcnt++;
 2129         if (VSHOULDBUSY(vp))
 2130                 vbusy(vp);
 2131 }
 2132 
 2133 /*
 2134  * Note that there is one less holder who cares about this vnode.  vdrop() is the
 2135  * opposite of vhold().
 2136  */
 2137 void
 2138 vdrop(struct vnode *vp)
 2139 {
 2140 
 2141         VI_LOCK(vp);
 2142         vdropl(vp);
 2143 }
 2144 
 2145 /*
 2146  * Drop the hold count of the vnode.  If this is the last reference to
 2147  * the vnode, we will free it if it has been vgone'd; otherwise it is
 2148  * placed on the free list.
 2149  */
 2150 static void
 2151 vdropl(struct vnode *vp)
 2152 {
 2153 
 2154         if (vp->v_holdcnt <= 0)
 2155                 panic("vdrop: holdcnt %d", vp->v_holdcnt);
 2156         vp->v_holdcnt--;
 2157         if (vp->v_holdcnt == 0) {
 2158                 if (vp->v_iflag & VI_DOOMED) {
 2159                         vdestroy(vp);
 2160                         return;
 2161                 } else
 2162                         vfree(vp);
 2163         }
 2164         VI_UNLOCK(vp);
 2165 }
 2166 
 2167 /*
 2168  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
 2169  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
 2170  * OWEINACT tracks whether a vnode missed a call to inactive due to a
 2171  * failed lock upgrade.
 2172  */
 2173 static void
 2174 vinactive(struct vnode *vp, struct thread *td)
 2175 {
 2176 
 2177         ASSERT_VOP_LOCKED(vp, "vinactive");
 2178         ASSERT_VI_LOCKED(vp, "vinactive");
 2179         VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
 2180             ("vinactive: recursed on VI_DOINGINACT"));
 2181         vp->v_iflag |= VI_DOINGINACT;
 2182         vp->v_iflag &= ~VI_OWEINACT;
 2183         VI_UNLOCK(vp);
 2184         VOP_INACTIVE(vp, td);
 2185         VI_LOCK(vp);
 2186         VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
 2187             ("vinactive: lost VI_DOINGINACT"));
 2188         vp->v_iflag &= ~VI_DOINGINACT;
 2189 }
 2190 
 2191 /*
 2192  * Remove any vnodes in the vnode table belonging to mount point mp.
 2193  *
 2194  * If FORCECLOSE is not specified, there should not be any active ones,
 2195  * return error if any are found (nb: this is a user error, not a
 2196  * system error). If FORCECLOSE is specified, detach any active vnodes
 2197  * that are found.
 2198  *
 2199  * If WRITECLOSE is set, only flush out regular file vnodes open for
 2200  * writing.
 2201  *
 2202  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
 2203  *
 2204  * `rootrefs' specifies the base reference count for the root vnode
 2205  * of this filesystem. The root vnode is considered busy if its
 2206  * v_usecount exceeds this value. On a successful return, vflush()
 2207  * will call vrele() on the root vnode exactly rootrefs times.
 2208  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 2209  * be zero.
 2210  */
 2211 #ifdef DIAGNOSTIC
 2212 static int busyprt = 0;         /* print out busy vnodes */
 2213 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
 2214 #endif
 2215 
 2216 int
 2217 vflush(mp, rootrefs, flags, td)
 2218         struct mount *mp;
 2219         int rootrefs;
 2220         int flags;
 2221         struct thread *td;
 2222 {
 2223         struct vnode *vp, *nvp, *rootvp = NULL;
 2224         struct vattr vattr;
 2225         int busy = 0, error;
 2226 
 2227         CTR1(KTR_VFS, "vflush: mp %p", mp);
 2228         if (rootrefs > 0) {
 2229                 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
 2230                     ("vflush: bad args"));
 2231                 /*
 2232                  * Get the filesystem root vnode. We can vput() it
 2233                  * immediately, since with rootrefs > 0, it won't go away.
 2234                  */
 2235                 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0)
 2236                         return (error);
 2237                 vput(rootvp);
 2238 
 2239         }
 2240         MNT_ILOCK(mp);
 2241 loop:
 2242         MNT_VNODE_FOREACH(vp, mp, nvp) {
 2243 
 2244                 VI_LOCK(vp);
 2245                 vholdl(vp);
 2246                 MNT_IUNLOCK(mp);
 2247                 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
 2248                 if (error) {
 2249                         vdrop(vp);
 2250                         MNT_ILOCK(mp);
 2251                         goto loop;
 2252                 }
 2253                 /*
 2254                  * Skip over vnodes marked VV_SYSTEM.
 2255                  */
 2256                 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
 2257                         VOP_UNLOCK(vp, 0, td);
 2258                         vdrop(vp);
 2259                         MNT_ILOCK(mp);
 2260                         continue;
 2261                 }
 2262                 /*
 2263                  * If WRITECLOSE is set, flush out unlinked but still open
 2264                  * files (even if open only for reading) and regular file
 2265                  * vnodes open for writing.
 2266                  */
 2267                 if (flags & WRITECLOSE) {
 2268                         error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
 2269                         VI_LOCK(vp);
 2270 
 2271                         if ((vp->v_type == VNON ||
 2272                             (error == 0 && vattr.va_nlink > 0)) &&
 2273                             (vp->v_writecount == 0 || vp->v_type != VREG)) {
 2274                                 VOP_UNLOCK(vp, 0, td);
 2275                                 vdropl(vp);
 2276                                 MNT_ILOCK(mp);
 2277                                 continue;
 2278                         }
 2279                 } else
 2280                         VI_LOCK(vp);
 2281                 /*
 2282                  * With v_usecount == 0, all we need to do is clear out the
 2283                  * vnode data structures and we are done.
 2284                  *
 2285                  * If FORCECLOSE is set, forcibly close the vnode.
 2286                  */
 2287                 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
 2288                         VNASSERT(vp->v_usecount == 0 ||
 2289                             (vp->v_type != VCHR && vp->v_type != VBLK), vp,
 2290                             ("device VNODE %p is FORCECLOSED", vp));
 2291                         vgonel(vp);
 2292                 } else {
 2293                         busy++;
 2294 #ifdef DIAGNOSTIC
 2295                         if (busyprt)
 2296                                 vprint("vflush: busy vnode", vp);
 2297 #endif
 2298                 }
 2299                 VOP_UNLOCK(vp, 0, td);
 2300                 vdropl(vp);
 2301                 MNT_ILOCK(mp);
 2302         }
 2303         MNT_IUNLOCK(mp);
 2304         if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
 2305                 /*
 2306                  * If just the root vnode is busy, and if its refcount
 2307                  * is equal to `rootrefs', then go ahead and kill it.
 2308                  */
 2309                 VI_LOCK(rootvp);
 2310                 KASSERT(busy > 0, ("vflush: not busy"));
 2311                 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
 2312                     ("vflush: usecount %d < rootrefs %d",
 2313                      rootvp->v_usecount, rootrefs));
 2314                 if (busy == 1 && rootvp->v_usecount == rootrefs) {
 2315                         VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK, td);
 2316                         vgone(rootvp);
 2317                         VOP_UNLOCK(rootvp, 0, td);
 2318                         busy = 0;
 2319                 } else
 2320                         VI_UNLOCK(rootvp);
 2321         }
 2322         if (busy)
 2323                 return (EBUSY);
 2324         for (; rootrefs > 0; rootrefs--)
 2325                 vrele(rootvp);
 2326         return (0);
 2327 }
 2328 
 2329 /*
 2330  * Recycle an unused vnode to the front of the free list.
 2331  */
 2332 int
 2333 vrecycle(struct vnode *vp, struct thread *td)
 2334 {
 2335         int recycled;
 2336 
 2337         ASSERT_VOP_LOCKED(vp, "vrecycle");
 2338         recycled = 0;
 2339         VI_LOCK(vp);
 2340         if (vp->v_usecount == 0) {
 2341                 recycled = 1;
 2342                 vgonel(vp);
 2343         }
 2344         VI_UNLOCK(vp);
 2345         return (recycled);
 2346 }
 2347 
 2348 /*
 2349  * Eliminate all activity associated with a vnode
 2350  * in preparation for reuse.
 2351  */
 2352 void
 2353 vgone(struct vnode *vp)
 2354 {
 2355         VI_LOCK(vp);
 2356         vgonel(vp);
 2357         VI_UNLOCK(vp);
 2358 }
 2359 
 2360 /*
 2361  * vgone, with the vp interlock held.
 2362  */
 2363 void
 2364 vgonel(struct vnode *vp)
 2365 {
 2366         struct thread *td;
 2367         int oweinact;
 2368         int active;
 2369 
 2370         CTR1(KTR_VFS, "vgonel: vp %p", vp);
 2371         ASSERT_VOP_LOCKED(vp, "vgonel");
 2372         ASSERT_VI_LOCKED(vp, "vgonel");
 2373 #if 0
 2374         /* XXX Need to fix ttyvp before I enable this. */
 2375         VNASSERT(vp->v_holdcnt, vp,
 2376             ("vgonel: vp %p has no reference.", vp));
 2377 #endif
 2378         td = curthread;
 2379 
 2380         /*
 2381          * Don't vgonel if we're already doomed.
 2382          */
 2383         if (vp->v_iflag & VI_DOOMED)
 2384                 return;
 2385         vp->v_iflag |= VI_DOOMED;
 2386         /*
 2387          * Check to see if the vnode is in use.  If so, we have to call
 2388          * VOP_CLOSE() and VOP_INACTIVE().
 2389          */
 2390         active = vp->v_usecount;
 2391         oweinact = (vp->v_iflag & VI_OWEINACT);
 2392         VI_UNLOCK(vp);
 2393         /*
 2394          * Clean out any buffers associated with the vnode.
 2395          * If the flush fails, just toss the buffers.
 2396          */
 2397         if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
 2398                 (void) vn_write_suspend_wait(vp, NULL, V_WAIT);
 2399         if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0)
 2400                 vinvalbuf(vp, 0, td, 0, 0);
 2401 
 2402         /*
 2403          * If purging an active vnode, it must be closed and
 2404          * deactivated before being reclaimed.
 2405          */
 2406         if (active)
 2407                 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
 2408         if (oweinact || active) {
 2409                 VI_LOCK(vp);
 2410                 if ((vp->v_iflag & VI_DOINGINACT) == 0)
 2411                         vinactive(vp, td);
 2412                 VI_UNLOCK(vp);
 2413         }
 2414         /*
 2415          * Reclaim the vnode.
 2416          */
 2417         if (VOP_RECLAIM(vp, td))
 2418                 panic("vgone: cannot reclaim");
 2419         VNASSERT(vp->v_object == NULL, vp,
 2420             ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
 2421         /*
 2422          * Delete from old mount point vnode list.
 2423          */
 2424         delmntque(vp);
 2425         cache_purge(vp);
 2426         /*
 2427          * Done with purge, reset to the standard lock and invalidate
 2428          * the vnode.
 2429          */
 2430         VI_LOCK(vp);
 2431         vp->v_vnlock = &vp->v_lock;
 2432         vp->v_op = &dead_vnodeops;
 2433         vp->v_tag = "none";
 2434         vp->v_type = VBAD;
 2435 }
 2436 
 2437 /*
 2438  * Calculate the total number of references to a special device.
 2439  */
 2440 int
 2441 vcount(vp)
 2442         struct vnode *vp;
 2443 {
 2444         int count;
 2445 
 2446         dev_lock();
 2447         count = vp->v_rdev->si_usecount;
 2448         dev_unlock();
 2449         return (count);
 2450 }
 2451 
 2452 /*
 2453  * Same as above, but using the struct cdev * as the argument.
 2454  */
 2455 int
 2456 count_dev(dev)
 2457         struct cdev *dev;
 2458 {
 2459         int count;
 2460 
 2461         dev_lock();
 2462         count = dev->si_usecount;
 2463         dev_unlock();
 2464         return(count);
 2465 }
 2466 
 2467 /*
 2468  * Print out a description of a vnode.
 2469  */
 2470 static char *typename[] =
 2471 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
 2472 
 2473 void
 2474 vn_printf(struct vnode *vp, const char *fmt, ...)
 2475 {
 2476         va_list ap;
 2477         char buf[96];
 2478 
 2479         va_start(ap, fmt);
 2480         vprintf(fmt, ap);
 2481         va_end(ap);
 2482         printf("%p: ", (void *)vp);
 2483         printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
 2484         printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
 2485             vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
 2486         buf[0] = '\0';
 2487         buf[1] = '\0';
 2488         if (vp->v_vflag & VV_ROOT)
 2489                 strcat(buf, "|VV_ROOT");
 2490         if (vp->v_vflag & VV_TEXT)
 2491                 strcat(buf, "|VV_TEXT");
 2492         if (vp->v_vflag & VV_SYSTEM)
 2493                 strcat(buf, "|VV_SYSTEM");
 2494         if (vp->v_iflag & VI_DOOMED)
 2495                 strcat(buf, "|VI_DOOMED");
 2496         if (vp->v_iflag & VI_FREE)
 2497                 strcat(buf, "|VI_FREE");
 2498         printf("    flags (%s)\n", buf + 1);
 2499         if (mtx_owned(VI_MTX(vp)))
 2500                 printf(" VI_LOCKed");
 2501         if (vp->v_object != NULL)
 2502                 printf("    v_object %p ref %d pages %d\n",
 2503                     vp->v_object, vp->v_object->ref_count,
 2504                     vp->v_object->resident_page_count);
 2505         printf("    ");
 2506         lockmgr_printinfo(vp->v_vnlock);
 2507         printf("\n");
 2508         if (vp->v_data != NULL)
 2509                 VOP_PRINT(vp);
 2510 }
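/*
 * [Editorial sketch -- not part of vfs_subr.c]  The flag printing in
 * vn_printf() above uses a small idiom: each set flag is appended as
 * "|NAME" and the final print starts at buf + 1, skipping the leading '|'.
 * A hedged stand-alone version of the same idiom; the flag values and
 * names are invented for illustration.
 */
#include <stdio.h>
#include <string.h>

#define F_ROOT          0x01
#define F_TEXT          0x02
#define F_SYSTEM        0x04

static void
print_flags(int flags)
{
        char buf[64];

        buf[0] = '\0';
        buf[1] = '\0';                  /* keep buf + 1 valid when no flags set */
        if (flags & F_ROOT)
                strcat(buf, "|ROOT");
        if (flags & F_TEXT)
                strcat(buf, "|TEXT");
        if (flags & F_SYSTEM)
                strcat(buf, "|SYSTEM");
        printf("flags (%s)\n", buf + 1);
}

int
main(void)
{
        print_flags(F_ROOT | F_SYSTEM); /* prints "flags (ROOT|SYSTEM)" */
        return (0);
}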
 2511 
 2512 #ifdef DDB
 2513 #include <ddb/ddb.h>
 2514 /*
 2515  * List all of the locked vnodes in the system.
 2516  * Called when debugging the kernel.
 2517  */
 2518 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
 2519 {
 2520         struct mount *mp, *nmp;
 2521         struct vnode *vp;
 2522 
 2523         /*
 2524          * Note: because this is DDB, we can't obey the locking semantics
 2525          * for these structures, which means we could catch an inconsistent
 2526          * state and dereference a nasty pointer.  Not much to be done
 2527          * about that.
 2528          */
 2529         printf("Locked vnodes\n");
 2530         for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 2531                 nmp = TAILQ_NEXT(mp, mnt_list);
 2532                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 2533                         if (VOP_ISLOCKED(vp, NULL))
 2534                                 vprint("", vp);
 2535                 }
 2536                 nmp = TAILQ_NEXT(mp, mnt_list);
 2537         }
 2538 }
 2539 #endif
 2540 
 2541 /*
 2542  * Fill in a struct xvfsconf based on a struct vfsconf.
 2543  */
 2544 static void
 2545 vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
 2546 {
 2547 
 2548         strcpy(xvfsp->vfc_name, vfsp->vfc_name);
 2549         xvfsp->vfc_typenum = vfsp->vfc_typenum;
 2550         xvfsp->vfc_refcount = vfsp->vfc_refcount;
 2551         xvfsp->vfc_flags = vfsp->vfc_flags;
 2552         /*
 2553          * These are unused in userland; we keep them
 2554          * to preserve binary compatibility.
 2555          */
 2556         xvfsp->vfc_vfsops = NULL;
 2557         xvfsp->vfc_next = NULL;
 2558 }
 2559 
 2560 /*
 2561  * Top level filesystem related information gathering.
 2562  */
 2563 static int
 2564 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
 2565 {
 2566         struct vfsconf *vfsp;
 2567         struct xvfsconf xvfsp;
 2568         int error;
 2569 
 2570         error = 0;
 2571         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 2572                 bzero(&xvfsp, sizeof(xvfsp));
 2573                 vfsconf2x(vfsp, &xvfsp);
 2574                 error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp);
 2575                 if (error)
 2576                         break;
 2577         }
 2578         return (error);
 2579 }
 2580 
 2581 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
 2582     "S,xvfsconf", "List of all configured filesystems");
 2583 
 2584 #ifndef BURN_BRIDGES
 2585 static int      sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
 2586 
 2587 static int
 2588 vfs_sysctl(SYSCTL_HANDLER_ARGS)
 2589 {
 2590         int *name = (int *)arg1 - 1;    /* XXX */
 2591         u_int namelen = arg2 + 1;       /* XXX */
 2592         struct vfsconf *vfsp;
 2593         struct xvfsconf xvfsp;
 2594 
 2595         printf("WARNING: userland calling deprecated sysctl, "
 2596             "please rebuild world\n");
 2597 
 2598 #if 1 || defined(COMPAT_PRELITE2)
 2599         /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
 2600         if (namelen == 1)
 2601                 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
 2602 #endif
 2603 
 2604         switch (name[1]) {
 2605         case VFS_MAXTYPENUM:
 2606                 if (namelen != 2)
 2607                         return (ENOTDIR);
 2608                 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
 2609         case VFS_CONF:
 2610                 if (namelen != 3)
 2611                         return (ENOTDIR);       /* overloaded */
 2612                 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
 2613                         if (vfsp->vfc_typenum == name[2])
 2614                                 break;
 2615                 if (vfsp == NULL)
 2616                         return (EOPNOTSUPP);
 2617                 bzero(&xvfsp, sizeof(xvfsp));
 2618                 vfsconf2x(vfsp, &xvfsp);
 2619                 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 2620         }
 2621         return (EOPNOTSUPP);
 2622 }
 2623 
 2624 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
 2625         vfs_sysctl, "Generic filesystem");
 2626 
 2627 #if 1 || defined(COMPAT_PRELITE2)
 2628 
 2629 static int
 2630 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
 2631 {
 2632         int error;
 2633         struct vfsconf *vfsp;
 2634         struct ovfsconf ovfs;
 2635 
 2636         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 2637                 bzero(&ovfs, sizeof(ovfs));
 2638                 ovfs.vfc_vfsops = vfsp->vfc_vfsops;     /* XXX used as flag */
 2639                 strcpy(ovfs.vfc_name, vfsp->vfc_name);
 2640                 ovfs.vfc_index = vfsp->vfc_typenum;
 2641                 ovfs.vfc_refcount = vfsp->vfc_refcount;
 2642                 ovfs.vfc_flags = vfsp->vfc_flags;
 2643                 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
 2644                 if (error)
 2645                         return error;
 2646         }
 2647         return 0;
 2648 }
 2649 
 2650 #endif /* 1 || COMPAT_PRELITE2 */
 2651 #endif /* !BURN_BRIDGES */
 2652 
 2653 #define KINFO_VNODESLOP         10
 2654 #ifdef notyet
 2655 /*
 2656  * Dump vnode list (via sysctl).
 2657  */
 2658 /* ARGSUSED */
 2659 static int
 2660 sysctl_vnode(SYSCTL_HANDLER_ARGS)
 2661 {
 2662         struct xvnode *xvn;
 2663         struct thread *td = req->td;
 2664         struct mount *mp;
 2665         struct vnode *vp;
 2666         int error, len, n;
 2667 
 2668         /*
 2669          * Stale numvnodes access is not fatal here.
 2670          */
 2671         req->lock = 0;
 2672         len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
 2673         if (!req->oldptr)
 2674                 /* Make an estimate */
 2675                 return (SYSCTL_OUT(req, 0, len));
 2676 
 2677         error = sysctl_wire_old_buffer(req, 0);
 2678         if (error != 0)
 2679                 return (error);
 2680         xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
 2681         n = 0;
 2682         mtx_lock(&mountlist_mtx);
 2683         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 2684                 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
 2685                         continue;
 2686                 MNT_ILOCK(mp);
 2687                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 2688                         if (n == len)
 2689                                 break;
 2690                         vref(vp);
 2691                         xvn[n].xv_size = sizeof *xvn;
 2692                         xvn[n].xv_vnode = vp;
 2693                         xvn[n].xv_id = 0;       /* XXX compat */
 2694 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
 2695                         XV_COPY(usecount);
 2696                         XV_COPY(writecount);
 2697                         XV_COPY(holdcnt);
 2698                         XV_COPY(mount);
 2699                         XV_COPY(numoutput);
 2700                         XV_COPY(type);
 2701 #undef XV_COPY
 2702                         xvn[n].xv_flag = vp->v_vflag;
 2703 
 2704                         switch (vp->v_type) {
 2705                         case VREG:
 2706                         case VDIR:
 2707                         case VLNK:
 2708                                 break;
 2709                         case VBLK:
 2710                         case VCHR:
 2711                                 if (vp->v_rdev == NULL) {
 2712                                         vrele(vp);
 2713                                         continue;
 2714                                 }
 2715                                 xvn[n].xv_dev = dev2udev(vp->v_rdev);
 2716                                 break;
 2717                         case VSOCK:
 2718                                 xvn[n].xv_socket = vp->v_socket;
 2719                                 break;
 2720                         case VFIFO:
 2721                                 xvn[n].xv_fifo = vp->v_fifoinfo;
 2722                                 break;
 2723                         case VNON:
 2724                         case VBAD:
 2725                         default:
 2726                                 /* shouldn't happen? */
 2727                                 vrele(vp);
 2728                                 continue;
 2729                         }
 2730                         vrele(vp);
 2731                         ++n;
 2732                 }
 2733                 MNT_IUNLOCK(mp);
 2734                 mtx_lock(&mountlist_mtx);
 2735                 vfs_unbusy(mp, td);
 2736                 if (n == len)
 2737                         break;
 2738         }
 2739         mtx_unlock(&mountlist_mtx);
 2740 
 2741         error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
 2742         free(xvn, M_TEMP);
 2743         return (error);
 2744 }
 2745 
 2746 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
 2747         0, 0, sysctl_vnode, "S,xvnode", "");
 2748 #endif
 2749 
 2750 /*
 2751  * Unmount all filesystems. The list is traversed in reverse order
 2752  * of mounting to avoid dependency problems.
 2753  */
 2754 void
 2755 vfs_unmountall()
 2756 {
 2757         struct mount *mp;
 2758         struct thread *td;
 2759         int error;
 2760 
 2761         KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread"));
 2762         td = curthread;
 2763         /*
 2764          * Since this only runs when rebooting, it is not interlocked.
 2765          */
 2766         while(!TAILQ_EMPTY(&mountlist)) {
 2767                 mp = TAILQ_LAST(&mountlist, mntlist);
 2768                 error = dounmount(mp, MNT_FORCE, td);
 2769                 if (error) {
 2770                         TAILQ_REMOVE(&mountlist, mp, mnt_list);
 2771                         /*
 2772                          * XXX: Due to the way in which we mount the root
 2773                          * file system off of devfs, devfs will generate a
 2774                          * "busy" warning when we try to unmount it before
 2775                          * the root.  Don't print a warning as a result in
 2776                          * order to avoid false positive errors that may
 2777                          * cause needless upset.
 2778                          */
 2779                         if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
 2780                                 printf("unmount of %s failed (",
 2781                                     mp->mnt_stat.f_mntonname);
 2782                                 if (error == EBUSY)
 2783                                         printf("BUSY)\n");
 2784                                 else
 2785                                         printf("%d)\n", error);
 2786                         }
 2787                 } else {
 2788                         /* The unmount has removed mp from the mountlist */
 2789                 }
 2790         }
 2791 }
 2792 
 2793 /*
 2794  * Perform msync on all vnodes under a mount point.
 2795  * The mount point must be locked.
 2796  */
 2797 void
 2798 vfs_msync(struct mount *mp, int flags)
 2799 {
 2800         struct vnode *vp, *nvp;
 2801         struct vm_object *obj;
 2802         int tries;
 2803 
 2804         tries = 5;
 2805         MNT_ILOCK(mp);
 2806 loop:
 2807         TAILQ_FOREACH_SAFE(vp, &mp->mnt_nvnodelist, v_nmntvnodes, nvp) {
 2808                 if (vp->v_mount != mp) {
 2809                         if (--tries > 0)
 2810                                 goto loop;
 2811                         break;
 2812                 }
 2813 
 2814                 VI_LOCK(vp);
 2815                 if ((vp->v_iflag & VI_OBJDIRTY) &&
 2816                     (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
 2817                         MNT_IUNLOCK(mp);
 2818                         if (!vget(vp,
 2819                             LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
 2820                             curthread)) {
 2821                                 if (vp->v_vflag & VV_NOSYNC) {  /* unlinked */
 2822                                         vput(vp);
 2823                                         MNT_ILOCK(mp);
 2824                                         continue;
 2825                                 }
 2826 
 2827                                 obj = vp->v_object;
 2828                                 if (obj != NULL) {
 2829                                         VM_OBJECT_LOCK(obj);
 2830                                         vm_object_page_clean(obj, 0, 0,
 2831                                             flags == MNT_WAIT ?
 2832                                             OBJPC_SYNC : OBJPC_NOSYNC);
 2833                                         VM_OBJECT_UNLOCK(obj);
 2834                                 }
 2835                                 vput(vp);
 2836                         }
 2837                         MNT_ILOCK(mp);
 2838                         if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
 2839                                 if (--tries > 0)
 2840                                         goto loop;
 2841                                 break;
 2842                         }
 2843                 } else
 2844                         VI_UNLOCK(vp);
 2845         }
 2846         MNT_IUNLOCK(mp);
 2847 }
 2848 
 2849 /*
 2850  * Mark a vnode as free, putting it up for recycling.
 2851  */
 2852 static void
 2853 vfree(struct vnode *vp)
 2854 {
 2855 
 2856         CTR1(KTR_VFS, "vfree vp %p", vp);
 2857         ASSERT_VI_LOCKED(vp, "vfree");
 2858         mtx_lock(&vnode_free_list_mtx);
 2859         VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed."));
 2860         VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free"));
 2861         VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't"));
 2862         VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp,
 2863             ("vfree: Freeing doomed vnode"));
 2864         if (vp->v_iflag & VI_AGE) {
 2865                 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 2866         } else {
 2867                 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 2868         }
 2869         freevnodes++;
 2870         vp->v_iflag &= ~VI_AGE;
 2871         vp->v_iflag |= VI_FREE;
 2872         mtx_unlock(&vnode_free_list_mtx);
 2873 }
 2874 
 2875 /*
 2876  * Opposite of vfree() - mark a vnode as in use.
 2877  */
 2878 static void
 2879 vbusy(struct vnode *vp)
 2880 {
 2881         CTR1(KTR_VFS, "vbusy vp %p", vp);
 2882         ASSERT_VI_LOCKED(vp, "vbusy");
 2883         VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
 2884         VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed."));
 2885 
 2886         mtx_lock(&vnode_free_list_mtx);
 2887         TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 2888         freevnodes--;
 2889         vp->v_iflag &= ~(VI_FREE|VI_AGE);
 2890         mtx_unlock(&vnode_free_list_mtx);
 2891 }
 2892 
 2893 /*
 2894  * Initialize per-vnode helper structure to hold poll-related state.
 2895  */
 2896 void
 2897 v_addpollinfo(struct vnode *vp)
 2898 {
 2899         struct vpollinfo *vi;
 2900 
 2901         vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
 2902         if (vp->v_pollinfo != NULL) {
 2903                 uma_zfree(vnodepoll_zone, vi);
 2904                 return;
 2905         }
 2906         vp->v_pollinfo = vi;
 2907         mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
 2908         knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, vp, vfs_knllock,
 2909             vfs_knlunlock, vfs_knllocked);
 2910 }
 2911 
 2912 /*
 2913  * Record a process's interest in events which might happen to
 2914  * a vnode.  Because poll uses the historic select-style interface
 2915  * internally, this routine serves as both the ``check for any
 2916  * pending events'' and the ``record my interest in future events''
 2917  * functions.  (These are done together, while the lock is held,
 2918  * to avoid race conditions.)
 2919  */
 2920 int
 2921 vn_pollrecord(vp, td, events)
 2922         struct vnode *vp;
 2923         struct thread *td;
 2924         short events;
 2925 {
 2926 
 2927         if (vp->v_pollinfo == NULL)
 2928                 v_addpollinfo(vp);
 2929         mtx_lock(&vp->v_pollinfo->vpi_lock);
 2930         if (vp->v_pollinfo->vpi_revents & events) {
 2931                 /*
 2932                  * This leaves events we are not interested
 2933                  * in available for the other process which
 2934                  * presumably had requested them
 2935                  * (otherwise they would never have been
 2936                  * recorded).
 2937                  */
 2938                 events &= vp->v_pollinfo->vpi_revents;
 2939                 vp->v_pollinfo->vpi_revents &= ~events;
 2940 
 2941                 mtx_unlock(&vp->v_pollinfo->vpi_lock);
 2942                 return events;
 2943         }
 2944         vp->v_pollinfo->vpi_events |= events;
 2945         selrecord(td, &vp->v_pollinfo->vpi_selinfo);
 2946         mtx_unlock(&vp->v_pollinfo->vpi_lock);
 2947         return 0;
 2948 }
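
/*
 * A minimal sketch, not from this file: a VOP_POLL() implementation with
 * no event sources of its own can answer the standard events directly and
 * fall back on vn_pollrecord() for anything else, roughly:
 */
#if 0
static int
example_poll(struct vop_poll_args *ap)
{
	if (ap->a_events & ~POLLSTANDARD)
		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
#endif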
 2949 
 2950 /*
 2951  * Routine to create and manage a filesystem syncer vnode.
 2952  */
 2953 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
 2954 static int      sync_fsync(struct  vop_fsync_args *);
 2955 static int      sync_inactive(struct  vop_inactive_args *);
 2956 static int      sync_reclaim(struct  vop_reclaim_args *);
 2957 
 2958 static struct vop_vector sync_vnodeops = {
 2959         .vop_bypass =   VOP_EOPNOTSUPP,
 2960         .vop_close =    sync_close,             /* close */
 2961         .vop_fsync =    sync_fsync,             /* fsync */
 2962         .vop_inactive = sync_inactive,  /* inactive */
 2963         .vop_reclaim =  sync_reclaim,   /* reclaim */
 2964         .vop_lock =     vop_stdlock,    /* lock */
 2965         .vop_unlock =   vop_stdunlock,  /* unlock */
 2966         .vop_islocked = vop_stdislocked,        /* islocked */
 2967 };
 2968 
 2969 /*
 2970  * Create a new filesystem syncer vnode for the specified mount point.
 2971  */
 2972 int
 2973 vfs_allocate_syncvnode(mp)
 2974         struct mount *mp;
 2975 {
 2976         struct vnode *vp;
 2977         static long start, incr, next;
 2978         int error;
 2979 
 2980         /* Allocate a new vnode */
 2981         if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) {
 2982                 mp->mnt_syncer = NULL;
 2983                 return (error);
 2984         }
 2985         vp->v_type = VNON;
 2986         /*
 2987          * Place the vnode onto the syncer worklist. We attempt to
 2988          * scatter them about on the list so that they will go off
 2989          * at evenly distributed times even if all the filesystems
 2990          * are mounted at once.
 2991          */
 2992         next += incr;
 2993         if (next == 0 || next > syncer_maxdelay) {
 2994                 start /= 2;
 2995                 incr /= 2;
 2996                 if (start == 0) {
 2997                         start = syncer_maxdelay / 2;
 2998                         incr = syncer_maxdelay;
 2999                 }
 3000                 next = start;
 3001         }
 3002         VI_LOCK(vp);
 3003         vn_syncer_add_to_worklist(&vp->v_bufobj,
 3004             syncdelay > 0 ? next % syncdelay : 0);
 3005         /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
 3006         mtx_lock(&sync_mtx);
 3007         sync_vnode_count++;
 3008         mtx_unlock(&sync_mtx);
 3009         VI_UNLOCK(vp);
 3010         mp->mnt_syncer = vp;
 3011         return (0);
 3012 }
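
/*
 * Worked example of the scattering above, assuming the traditional
 * defaults syncer_maxdelay == 32 and syncdelay == 30: successive mounts
 * are assigned worklist slots 16, 8, 24, 4, 12, 20, 28, 2, ...  Each time
 * "incr" is halved, new mounts are interleaved between the slots already
 * handed out, so the per-mount syncs do not all fire in the same second.
 */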
 3013 
 3014 /*
 3015  * Do a lazy sync of the filesystem.
 3016  */
 3017 static int
 3018 sync_fsync(ap)
 3019         struct vop_fsync_args /* {
 3020                 struct vnode *a_vp;
 3021                 struct ucred *a_cred;
 3022                 int a_waitfor;
 3023                 struct thread *a_td;
 3024         } */ *ap;
 3025 {
 3026         struct vnode *syncvp = ap->a_vp;
 3027         struct mount *mp = syncvp->v_mount;
 3028         struct thread *td = ap->a_td;
 3029         int error, asyncflag;
 3030         struct bufobj *bo;
 3031 
 3032         /*
 3033          * We only need to do something if this is a lazy evaluation.
 3034          */
 3035         if (ap->a_waitfor != MNT_LAZY)
 3036                 return (0);
 3037 
 3038         /*
 3039          * Move ourselves to the back of the sync list.
 3040          */
 3041         bo = &syncvp->v_bufobj;
 3042         BO_LOCK(bo);
 3043         vn_syncer_add_to_worklist(bo, syncdelay);
 3044         BO_UNLOCK(bo);
 3045 
 3046         /*
 3047          * Walk the list of vnodes pushing all that are dirty and
 3048          * not already on the sync list.
 3049          */
 3050         mtx_lock(&mountlist_mtx);
 3051         if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
 3052                 mtx_unlock(&mountlist_mtx);
 3053                 return (0);
 3054         }
 3055         if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
 3056                 vfs_unbusy(mp, td);
 3057                 return (0);
 3058         }
 3059         asyncflag = mp->mnt_flag & MNT_ASYNC;
 3060         mp->mnt_flag &= ~MNT_ASYNC;
 3061         vfs_msync(mp, MNT_NOWAIT);
 3062         error = VFS_SYNC(mp, MNT_LAZY, td);
 3063         if (asyncflag)
 3064                 mp->mnt_flag |= MNT_ASYNC;
 3065         vn_finished_write(mp);
 3066         vfs_unbusy(mp, td);
 3067         return (error);
 3068 }
 3069 
 3070 /*
 3071  * The syncer vnode is no longer referenced.
 3072  */
 3073 static int
 3074 sync_inactive(ap)
 3075         struct vop_inactive_args /* {
 3076                 struct vnode *a_vp;
 3077                 struct thread *a_td;
 3078         } */ *ap;
 3079 {
 3080 
 3081         vgone(ap->a_vp);
 3082         return (0);
 3083 }
 3084 
 3085 /*
 3086  * The syncer vnode is no longer needed and is being decommissioned.
 3087  *
 3088  * Modifications to the worklist must be protected by sync_mtx.
 3089  */
 3090 static int
 3091 sync_reclaim(ap)
 3092         struct vop_reclaim_args /* {
 3093                 struct vnode *a_vp;
 3094         } */ *ap;
 3095 {
 3096         struct vnode *vp = ap->a_vp;
 3097         struct bufobj *bo;
 3098 
 3099         VI_LOCK(vp);
 3100         bo = &vp->v_bufobj;
 3101         vp->v_mount->mnt_syncer = NULL;
 3102         if (bo->bo_flag & BO_ONWORKLST) {
 3103                 mtx_lock(&sync_mtx);
 3104                 LIST_REMOVE(bo, bo_synclist);
 3105                 syncer_worklist_len--;
 3106                 sync_vnode_count--;
 3107                 mtx_unlock(&sync_mtx);
 3108                 bo->bo_flag &= ~BO_ONWORKLST;
 3109         }
 3110         VI_UNLOCK(vp);
 3111 
 3112         return (0);
 3113 }
 3114 
 3115 /*
 3116  * Check if a vnode represents a disk device.
 3117  */
 3118 int
 3119 vn_isdisk(vp, errp)
 3120         struct vnode *vp;
 3121         int *errp;
 3122 {
 3123         int error;
 3124 
 3125         error = 0;
 3126         dev_lock();
 3127         if (vp->v_type != VCHR)
 3128                 error = ENOTBLK;
 3129         else if (vp->v_rdev == NULL)
 3130                 error = ENXIO;
 3131         else if (vp->v_rdev->si_devsw == NULL)
 3132                 error = ENXIO;
 3133         else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
 3134                 error = ENOTBLK;
 3135         dev_unlock();
 3136         if (errp != NULL)
 3137                 *errp = error;
 3138         return (error == 0);
 3139 }
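
/*
 * A minimal usage sketch, not part of this file: callers combine the
 * boolean result with the errno output, e.g. when vetting the device
 * vnode handed to a disk-based filesystem's mount routine:
 */
#if 0
	int error;

	if (!vn_isdisk(devvp, &error))
		return (error);		/* ENOTBLK or ENXIO from above */
#endif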
 3140 
 3141 /*
 3142  * Common filesystem object access control check routine.  Accepts a
 3143  * vnode's type, "mode", uid and gid, requested access mode, credentials,
 3144  * and optional call-by-reference privused argument allowing vaccess()
 3145  * to indicate to the caller whether privilege was used to satisfy the
 3146  * request (obsoleted).  Returns 0 on success, or an errno on failure.
 3147  */
 3148 int
 3149 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
 3150         enum vtype type;
 3151         mode_t file_mode;
 3152         uid_t file_uid;
 3153         gid_t file_gid;
 3154         mode_t acc_mode;
 3155         struct ucred *cred;
 3156         int *privused;
 3157 {
 3158         mode_t dac_granted;
 3159 #ifdef CAPABILITIES
 3160         mode_t cap_granted;
 3161 #endif
 3162 
 3163         /*
 3164          * Look for a normal, non-privileged way to access the file/directory
 3165          * as requested.  If it exists, go with that.
 3166          */
 3167 
 3168         if (privused != NULL)
 3169                 *privused = 0;
 3170 
 3171         dac_granted = 0;
 3172 
 3173         /* Check the owner. */
 3174         if (cred->cr_uid == file_uid) {
 3175                 dac_granted |= VADMIN;
 3176                 if (file_mode & S_IXUSR)
 3177                         dac_granted |= VEXEC;
 3178                 if (file_mode & S_IRUSR)
 3179                         dac_granted |= VREAD;
 3180                 if (file_mode & S_IWUSR)
 3181                         dac_granted |= (VWRITE | VAPPEND);
 3182 
 3183                 if ((acc_mode & dac_granted) == acc_mode)
 3184                         return (0);
 3185 
 3186                 goto privcheck;
 3187         }
 3188 
 3189         /* Otherwise, check the groups (first match) */
 3190         if (groupmember(file_gid, cred)) {
 3191                 if (file_mode & S_IXGRP)
 3192                         dac_granted |= VEXEC;
 3193                 if (file_mode & S_IRGRP)
 3194                         dac_granted |= VREAD;
 3195                 if (file_mode & S_IWGRP)
 3196                         dac_granted |= (VWRITE | VAPPEND);
 3197 
 3198                 if ((acc_mode & dac_granted) == acc_mode)
 3199                         return (0);
 3200 
 3201                 goto privcheck;
 3202         }
 3203 
 3204         /* Otherwise, check everyone else. */
 3205         if (file_mode & S_IXOTH)
 3206                 dac_granted |= VEXEC;
 3207         if (file_mode & S_IROTH)
 3208                 dac_granted |= VREAD;
 3209         if (file_mode & S_IWOTH)
 3210                 dac_granted |= (VWRITE | VAPPEND);
 3211         if ((acc_mode & dac_granted) == acc_mode)
 3212                 return (0);
 3213 
 3214 privcheck:
 3215         if (!suser_cred(cred, SUSER_ALLOWJAIL)) {
 3216                 /* XXX audit: privilege used */
 3217                 if (privused != NULL)
 3218                         *privused = 1;
 3219                 return (0);
 3220         }
 3221 
 3222 #ifdef CAPABILITIES
 3223         /*
 3224          * Build a capability mask to determine if the set of capabilities
 3225          * satisfies the requirements when combined with the granted mask
 3226          * from above.
 3227          * For each capability, if the capability is required, bitwise
 3228          * or the request type onto the cap_granted mask.
 3229          */
 3230         cap_granted = 0;
 3231 
 3232         if (type == VDIR) {
 3233                 /*
 3234                  * For directories, use CAP_DAC_READ_SEARCH to satisfy
 3235                  * VEXEC requests, instead of CAP_DAC_EXECUTE.
 3236                  */
 3237                 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 3238                     !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
 3239                         cap_granted |= VEXEC;
 3240         } else {
 3241                 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 3242                     !cap_check(cred, NULL, CAP_DAC_EXECUTE, SUSER_ALLOWJAIL))
 3243                         cap_granted |= VEXEC;
 3244         }
 3245 
 3246         if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
 3247             !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
 3248                 cap_granted |= VREAD;
 3249 
 3250         if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
 3251             !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL))
 3252                 cap_granted |= (VWRITE | VAPPEND);
 3253 
 3254         if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
 3255             !cap_check(cred, NULL, CAP_FOWNER, SUSER_ALLOWJAIL))
 3256                 cap_granted |= VADMIN;
 3257 
 3258         if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
 3259                 /* XXX audit: privilege used */
 3260                 if (privused != NULL)
 3261                         *privused = 1;
 3262                 return (0);
 3263         }
 3264 #endif
 3265 
 3266         return ((acc_mode & VADMIN) ? EPERM : EACCES);
 3267 }
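
/*
 * A minimal sketch, not from this file: a disk filesystem's VOP_ACCESS()
 * commonly reduces to a single vaccess() call using the on-disk ownership
 * and mode bits.  The inode type and VTOEX() accessor below are
 * hypothetical placeholders for the filesystem's own structures:
 */
#if 0
static int
example_access(struct vop_access_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct example_inode *ip = VTOEX(vp);	/* hypothetical */

	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
	    ap->a_mode, ap->a_cred, NULL));
}
#endif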
 3268 
 3269 /*
 3270  * Credential check based on process requesting service, and per-attribute
 3271  * permissions.
 3272  */
 3273 int
 3274 extattr_check_cred(struct vnode *vp, int attrnamespace,
 3275     struct ucred *cred, struct thread *td, int access)
 3276 {
 3277 
 3278         /*
 3279          * Kernel-invoked access always succeeds.
 3280          */
 3281         if (cred == NOCRED)
 3282                 return (0);
 3283 
 3284         /*
 3285          * Do not allow privileged processes in jail to directly
 3286          * manipulate system attributes.
 3287          *
 3288          * XXX What capability should apply here?
 3289          * Probably CAP_SYS_SETFFLAG.
 3290          */
 3291         switch (attrnamespace) {
 3292         case EXTATTR_NAMESPACE_SYSTEM:
 3293                 /* Potentially should be: return (EPERM); */
 3294                 return (suser_cred(cred, 0));
 3295         case EXTATTR_NAMESPACE_USER:
 3296                 return (VOP_ACCESS(vp, access, cred, td));
 3297         default:
 3298                 return (EPERM);
 3299         }
 3300 }
 3301 
 3302 #ifdef DEBUG_VFS_LOCKS
 3303 /*
 3304  * This only exists to suppress warnings from unlocked specfs accesses.  It is
 3305  * no longer ok to have an unlocked VFS.
 3306  */
 3307 #define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
 3308 
 3309 int vfs_badlock_ddb = 1;        /* Drop into debugger on violation. */
 3310 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "");
 3311 
 3312 int vfs_badlock_mutex = 1;      /* Check for interlock across VOPs. */
 3313 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "");
 3314 
 3315 int vfs_badlock_print = 1;      /* Print lock violations. */
 3316 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "");
 3317 
 3318 #ifdef KDB
 3319 int vfs_badlock_backtrace = 1;  /* Print backtrace at lock violations. */
 3320 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "");
 3321 #endif
 3322 
 3323 static void
 3324 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
 3325 {
 3326 
 3327 #ifdef KDB
 3328         if (vfs_badlock_backtrace)
 3329                 kdb_backtrace();
 3330 #endif
 3331         if (vfs_badlock_print)
 3332                 printf("%s: %p %s\n", str, (void *)vp, msg);
 3333         if (vfs_badlock_ddb)
 3334                 kdb_enter("lock violation");
 3335 }
 3336 
 3337 void
 3338 assert_vi_locked(struct vnode *vp, const char *str)
 3339 {
 3340 
 3341         if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
 3342                 vfs_badlock("interlock is not locked but should be", str, vp);
 3343 }
 3344 
 3345 void
 3346 assert_vi_unlocked(struct vnode *vp, const char *str)
 3347 {
 3348 
 3349         if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
 3350                 vfs_badlock("interlock is locked but should not be", str, vp);
 3351 }
 3352 
 3353 void
 3354 assert_vop_locked(struct vnode *vp, const char *str)
 3355 {
 3356 
 3357         if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0)
 3358                 vfs_badlock("is not locked but should be", str, vp);
 3359 }
 3360 
 3361 void
 3362 assert_vop_unlocked(struct vnode *vp, const char *str)
 3363 {
 3364 
 3365         if (vp && !IGNORE_LOCK(vp) &&
 3366             VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
 3367                 vfs_badlock("is locked but should not be", str, vp);
 3368 }
 3369 
 3370 void
 3371 assert_vop_elocked(struct vnode *vp, const char *str)
 3372 {
 3373 
 3374         if (vp && !IGNORE_LOCK(vp) &&
 3375             VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
 3376                 vfs_badlock("is not exclusive locked but should be", str, vp);
 3377 }
 3378 
 3379 #if 0
 3380 void
 3381 assert_vop_elocked_other(struct vnode *vp, const char *str)
 3382 {
 3383 
 3384         if (vp && !IGNORE_LOCK(vp) &&
 3385             VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
 3386                 vfs_badlock("is not exclusive locked by another thread",
 3387                     str, vp);
 3388 }
 3389 
 3390 void
 3391 assert_vop_slocked(struct vnode *vp, const char *str)
 3392 {
 3393 
 3394         if (vp && !IGNORE_LOCK(vp) &&
 3395             VOP_ISLOCKED(vp, curthread) != LK_SHARED)
 3396                 vfs_badlock("is not locked shared but should be", str, vp);
 3397 }
 3398 #endif /* 0 */
 3399 #endif /* DEBUG_VFS_LOCKS */
 3400 
 3401 void
 3402 vop_rename_pre(void *ap)
 3403 {
 3404         struct vop_rename_args *a = ap;
 3405 
 3406 #ifdef DEBUG_VFS_LOCKS
 3407         if (a->a_tvp)
 3408                 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
 3409         ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
 3410         ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
 3411         ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
 3412 
 3413         /* Check the source (from). */
 3414         if (a->a_tdvp != a->a_fdvp)
 3415                 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
 3416         if (a->a_tvp != a->a_fvp)
 3417                 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
 3418 
 3419         /* Check the target. */
 3420         if (a->a_tvp)
 3421                 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
 3422         ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
 3423 #endif
 3424         if (a->a_tdvp != a->a_fdvp)
 3425                 vhold(a->a_fdvp);
 3426         if (a->a_tvp != a->a_fvp)
 3427                 vhold(a->a_fvp);
 3428         vhold(a->a_tdvp);
 3429         if (a->a_tvp)
 3430                 vhold(a->a_tvp);
 3431 }
 3432 
 3433 void
 3434 vop_strategy_pre(void *ap)
 3435 {
 3436 #ifdef DEBUG_VFS_LOCKS
 3437         struct vop_strategy_args *a;
 3438         struct buf *bp;
 3439 
 3440         a = ap;
 3441         bp = a->a_bp;
 3442 
 3443         /*
 3444          * Cluster ops lock their component buffers but not the IO container.
 3445          */
 3446         if ((bp->b_flags & B_CLUSTER) != 0)
 3447                 return;
 3448 
 3449         if (BUF_REFCNT(bp) < 1) {
 3450                 if (vfs_badlock_print)
 3451                         printf(
 3452                             "VOP_STRATEGY: bp is not locked but should be\n");
 3453                 if (vfs_badlock_ddb)
 3454                         kdb_enter("lock violation");
 3455         }
 3456 #endif
 3457 }
 3458 
 3459 void
 3460 vop_lookup_pre(void *ap)
 3461 {
 3462 #ifdef DEBUG_VFS_LOCKS
 3463         struct vop_lookup_args *a;
 3464         struct vnode *dvp;
 3465 
 3466         a = ap;
 3467         dvp = a->a_dvp;
 3468         ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
 3469         ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
 3470 #endif
 3471 }
 3472 
 3473 void
 3474 vop_lookup_post(void *ap, int rc)
 3475 {
 3476 #ifdef DEBUG_VFS_LOCKS
 3477         struct vop_lookup_args *a;
 3478         struct vnode *dvp;
 3479         struct vnode *vp;
 3480 
 3481         a = ap;
 3482         dvp = a->a_dvp;
 3483         vp = *(a->a_vpp);
 3484 
 3485         ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
 3486         ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
 3487 
 3488         if (!rc)
 3489                 ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)");
 3490 #endif
 3491 }
 3492 
 3493 void
 3494 vop_lock_pre(void *ap)
 3495 {
 3496 #ifdef DEBUG_VFS_LOCKS
 3497         struct vop_lock_args *a = ap;
 3498 
 3499         if ((a->a_flags & LK_INTERLOCK) == 0)
 3500                 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 3501         else
 3502                 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
 3503 #endif
 3504 }
 3505 
 3506 void
 3507 vop_lock_post(void *ap, int rc)
 3508 {
 3509 #ifdef DEBUG_VFS_LOCKS
 3510         struct vop_lock_args *a = ap;
 3511 
 3512         ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 3513         if (rc == 0)
 3514                 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
 3515 #endif
 3516 }
 3517 
 3518 void
 3519 vop_unlock_pre(void *ap)
 3520 {
 3521 #ifdef DEBUG_VFS_LOCKS
 3522         struct vop_unlock_args *a = ap;
 3523 
 3524         if (a->a_flags & LK_INTERLOCK)
 3525                 ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
 3526         ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
 3527 #endif
 3528 }
 3529 
 3530 void
 3531 vop_unlock_post(void *ap, int rc)
 3532 {
 3533 #ifdef DEBUG_VFS_LOCKS
 3534         struct vop_unlock_args *a = ap;
 3535 
 3536         if (a->a_flags & LK_INTERLOCK)
 3537                 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
 3538 #endif
 3539 }
 3540 
 3541 void
 3542 vop_create_post(void *ap, int rc)
 3543 {
 3544         struct vop_create_args *a = ap;
 3545 
 3546         if (!rc)
 3547                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 
 3548 }
 3549 
 3550 void
 3551 vop_link_post(void *ap, int rc)
 3552 {
 3553         struct vop_link_args *a = ap;
 3554         
 3555         if (!rc) {
 3556                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); 
 3557                 VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
 3558         }
 3559 }
 3560 
 3561 void
 3562 vop_mkdir_post(void *ap, int rc)
 3563 {
 3564         struct vop_mkdir_args *a = ap;
 3565 
 3566         if (!rc)
 3567                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
 3568 }
 3569 
 3570 void
 3571 vop_mknod_post(void *ap, int rc)
 3572 {
 3573         struct vop_mknod_args *a = ap;
 3574 
 3575         if (!rc)
 3576                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
 3577 }
 3578 
 3579 void
 3580 vop_remove_post(void *ap, int rc)
 3581 {
 3582         struct vop_remove_args *a = ap;
 3583 
 3584         if (!rc) {
 3585                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
 3586                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
 3587         }
 3588 }
 3589 
 3590 void
 3591 vop_rename_post(void *ap, int rc)
 3592 {
 3593         struct vop_rename_args *a = ap;
 3594 
 3595         if (!rc) {
 3596                 VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
 3597                 VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
 3598                 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
 3599                 if (a->a_tvp)
 3600                         VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
 3601         }
 3602         if (a->a_tdvp != a->a_fdvp)
 3603                 vdrop(a->a_fdvp);
 3604         if (a->a_tvp != a->a_fvp)
 3605                 vdrop(a->a_fvp);
 3606         vdrop(a->a_tdvp);
 3607         if (a->a_tvp)
 3608                 vdrop(a->a_tvp);
 3609 }
 3610 
 3611 void
 3612 vop_rmdir_post(void *ap, int rc)
 3613 {
 3614         struct vop_rmdir_args *a = ap;
 3615 
 3616         if (!rc) {
 3617                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
 3618                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
 3619         }
 3620 }
 3621 
 3622 void
 3623 vop_setattr_post(void *ap, int rc)
 3624 {
 3625         struct vop_setattr_args *a = ap;
 3626 
 3627         if (!rc)
 3628                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
 3629 }
 3630 
 3631 void
 3632 vop_symlink_post(void *ap, int rc)
 3633 {
 3634         struct vop_symlink_args *a = ap;
 3635         
 3636         if (!rc)
 3637                 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
 3638 }
 3639 
 3640 static struct knlist fs_knlist;
 3641 
 3642 static void
 3643 vfs_event_init(void *arg)
 3644 {
 3645         knlist_init(&fs_knlist, NULL, NULL, NULL, NULL);
 3646 }
 3647 /* XXX - correct order? */
 3648 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
 3649 
 3650 void
 3651 vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused)
 3652 {
 3653 
 3654         KNOTE_UNLOCKED(&fs_knlist, event);
 3655 }
 3656 
 3657 static int      filt_fsattach(struct knote *kn);
 3658 static void     filt_fsdetach(struct knote *kn);
 3659 static int      filt_fsevent(struct knote *kn, long hint);
 3660 
 3661 struct filterops fs_filtops =
 3662         { 0, filt_fsattach, filt_fsdetach, filt_fsevent };
 3663 
 3664 static int
 3665 filt_fsattach(struct knote *kn)
 3666 {
 3667 
 3668         kn->kn_flags |= EV_CLEAR;
 3669         knlist_add(&fs_knlist, kn, 0);
 3670         return (0);
 3671 }
 3672 
 3673 static void
 3674 filt_fsdetach(struct knote *kn)
 3675 {
 3676 
 3677         knlist_remove(&fs_knlist, kn, 0);
 3678 }
 3679 
 3680 static int
 3681 filt_fsevent(struct knote *kn, long hint)
 3682 {
 3683 
 3684         kn->kn_fflags |= hint;
 3685         return (kn->kn_fflags != 0);
 3686 }
 3687 
 3688 static int
 3689 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
 3690 {
 3691         struct vfsidctl vc;
 3692         int error;
 3693         struct mount *mp;
 3694 
 3695         error = SYSCTL_IN(req, &vc, sizeof(vc));
 3696         if (error)
 3697                 return (error);
 3698         if (vc.vc_vers != VFS_CTL_VERS1)
 3699                 return (EINVAL);
 3700         mp = vfs_getvfs(&vc.vc_fsid);
 3701         if (mp == NULL)
 3702                 return (ENOENT);
 3703         /* ensure that a specific sysctl goes to the right filesystem. */
 3704         if (strcmp(vc.vc_fstypename, "*") != 0 &&
 3705             strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
 3706                 return (EINVAL);
 3707         }
 3708         VCTLTOREQ(&vc, req);
 3709         return (VFS_SYSCTL(mp, vc.vc_op, req));
 3710 }
 3711 
 3712 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR,
 3713         NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid");
 3714 
 3715 /*
 3716  * Function to initialize a va_filerev field sensibly.
 3717  * XXX: Wouldn't a random number make a lot more sense ??
 3718  */
 3719 u_quad_t
 3720 init_va_filerev(void)
 3721 {
 3722         struct bintime bt;
 3723 
 3724         getbinuptime(&bt);
 3725         return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
 3726 }
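
/*
 * The expression above packs the boot-time clock into 64 bits: the uptime
 * in seconds occupies the upper 32 bits and the top half of the binary
 * fraction the lower 32 bits.  For example, at an uptime of 5.5 seconds
 * (bt.sec == 5, bt.frac == 1ULL << 63) the result is
 * (5ULL << 32) | 0x80000000, so the value increases monotonically for as
 * long as the system stays up.
 */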
 3727 
 3728 static int      filt_vfsread(struct knote *kn, long hint);
 3729 static int      filt_vfswrite(struct knote *kn, long hint);
 3730 static int      filt_vfsvnode(struct knote *kn, long hint);
 3731 static void     filt_vfsdetach(struct knote *kn);
 3732 static struct filterops vfsread_filtops =
 3733         { 1, NULL, filt_vfsdetach, filt_vfsread };
 3734 static struct filterops vfswrite_filtops =
 3735         { 1, NULL, filt_vfsdetach, filt_vfswrite };
 3736 static struct filterops vfsvnode_filtops =
 3737         { 1, NULL, filt_vfsdetach, filt_vfsvnode };
 3738 
 3739 static void
 3740 vfs_knllock(void *arg)
 3741 {
 3742         struct vnode *vp = arg;
 3743 
 3744         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
 3745 }
 3746 
 3747 static void
 3748 vfs_knlunlock(void *arg)
 3749 {
 3750         struct vnode *vp = arg;
 3751 
 3752         VOP_UNLOCK(vp, 0, curthread);
 3753 }
 3754 
 3755 static int
 3756 vfs_knllocked(void *arg)
 3757 {
 3758         struct vnode *vp = arg;
 3759 
 3760         return (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE);
 3761 }
 3762 
 3763 int
 3764 vfs_kqfilter(struct vop_kqfilter_args *ap)
 3765 {
 3766         struct vnode *vp = ap->a_vp;
 3767         struct knote *kn = ap->a_kn;
 3768         struct knlist *knl; 
 3769 
 3770         switch (kn->kn_filter) {
 3771         case EVFILT_READ:
 3772                 kn->kn_fop = &vfsread_filtops;
 3773                 break;
 3774         case EVFILT_WRITE:
 3775                 kn->kn_fop = &vfswrite_filtops;
 3776                 break;
 3777         case EVFILT_VNODE:
 3778                 kn->kn_fop = &vfsvnode_filtops;
 3779                 break;
 3780         default:
 3781                 return (EINVAL);
 3782         }
 3783 
 3784         kn->kn_hook = (caddr_t)vp;
 3785 
 3786         if (vp->v_pollinfo == NULL)
 3787                 v_addpollinfo(vp);
 3788         if (vp->v_pollinfo == NULL)
 3789                 return (ENOMEM);
 3790         knl = &vp->v_pollinfo->vpi_selinfo.si_note;
 3791         knlist_add(knl, kn, 0);
 3792 
 3793         return (0);
 3794 }
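
/*
 * A minimal usage sketch, not part of this file: the filters installed
 * above back the userland EVFILT_VNODE interface, e.g. a process watching
 * an open file for deletion, rename or writes:
 */
#if 0
	struct kevent kev;
	int kq = kqueue();

	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
	    NOTE_DELETE | NOTE_RENAME | NOTE_WRITE, 0, NULL);
	(void)kevent(kq, &kev, 1, NULL, 0, NULL);	/* register the knote */
#endif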
 3795 
 3796 /*
 3797  * Detach knote from vnode
 3798  */
 3799 static void
 3800 filt_vfsdetach(struct knote *kn)
 3801 {
 3802         struct vnode *vp = (struct vnode *)kn->kn_hook;
 3803 
 3804         KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
 3805         knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
 3806 }
 3807 
 3808 /*ARGSUSED*/
 3809 static int
 3810 filt_vfsread(struct knote *kn, long hint)
 3811 {
 3812         struct vnode *vp = (struct vnode *)kn->kn_hook;
 3813         struct vattr va;
 3814 
 3815         /*
 3816          * filesystem is gone, so set the EOF flag and schedule
 3817          * the knote for deletion.
 3818          */
 3819         if (hint == NOTE_REVOKE) {
 3820                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 3821                 return (1);
 3822         }
 3823 
 3824         if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread)) 
 3825                 return (0);
 3826 
 3827         kn->kn_data = va.va_size - kn->kn_fp->f_offset;
 3828         return (kn->kn_data != 0);
 3829 }
 3830 
 3831 /*ARGSUSED*/
 3832 static int
 3833 filt_vfswrite(struct knote *kn, long hint)
 3834 {
 3835         /*
 3836          * filesystem is gone, so set the EOF flag and schedule
 3837          * the knote for deletion.
 3838          */
 3839         if (hint == NOTE_REVOKE)
 3840                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 3841 
 3842         kn->kn_data = 0;
 3843         return (1);
 3844 }
 3845 
 3846 static int
 3847 filt_vfsvnode(struct knote *kn, long hint)
 3848 {
 3849         if (kn->kn_sfflags & hint)
 3850                 kn->kn_fflags |= hint;
 3851         if (hint == NOTE_REVOKE) {
 3852                 kn->kn_flags |= EV_EOF;
 3853                 return (1);
 3854         }
 3855         return (kn->kn_fflags != 0);
 3856 }
 3857 
 3858 int
 3859 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
 3860 {
 3861         int error;
 3862 
 3863         if (dp->d_reclen > ap->a_uio->uio_resid)
 3864                 return (ENAMETOOLONG);
 3865         error = uiomove(dp, dp->d_reclen, ap->a_uio);
 3866         if (error) {
 3867                 if (ap->a_ncookies != NULL) {
 3868                         if (ap->a_cookies != NULL)
 3869                                 free(ap->a_cookies, M_TEMP);
 3870                         ap->a_cookies = NULL;
 3871                         *ap->a_ncookies = 0;
 3872                 }
 3873                 return (error);
 3874         }
 3875         if (ap->a_ncookies == NULL)
 3876                 return (0);
 3877         *ap->a_cookies = realloc(*ap->a_cookies,
 3878             (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
 3879         (*ap->a_cookies)[*ap->a_ncookies] = off;
 3880         return (0);
 3881 }
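
/*
 * A minimal usage sketch, not from this file: a synthetic filesystem's
 * VOP_READDIR() can emit each entry through vfs_read_dirent() and treat
 * the ENAMETOOLONG return (next record does not fit in the caller's
 * buffer) as "stop here" rather than as a hard error:
 */
#if 0
	error = vfs_read_dirent(ap, &de, off);	/* "de" built by the fs */
	if (error == ENAMETOOLONG)
		error = 0;			/* buffer full, not fatal */
#endif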
 3882 
