FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_subr.c


    1 /*      $NetBSD: vfs_subr.c,v 1.276.2.3 2007/08/01 14:45:47 liamjfoy Exp $      */
    2 
    3 /*-
    4  * Copyright (c) 1997, 1998, 2004, 2005 The NetBSD Foundation, Inc.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to The NetBSD Foundation
    8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
    9  * NASA Ames Research Center.
   10  * This code is derived from software contributed to The NetBSD Foundation
   11  * by Charles M. Hannum.
   12  *
   13  * Redistribution and use in source and binary forms, with or without
   14  * modification, are permitted provided that the following conditions
   15  * are met:
   16  * 1. Redistributions of source code must retain the above copyright
   17  *    notice, this list of conditions and the following disclaimer.
   18  * 2. Redistributions in binary form must reproduce the above copyright
   19  *    notice, this list of conditions and the following disclaimer in the
   20  *    documentation and/or other materials provided with the distribution.
   21  * 3. All advertising materials mentioning features or use of this software
   22  *    must display the following acknowledgement:
   23  *      This product includes software developed by the NetBSD
   24  *      Foundation, Inc. and its contributors.
   25  * 4. Neither the name of The NetBSD Foundation nor the names of its
   26  *    contributors may be used to endorse or promote products derived
   27  *    from this software without specific prior written permission.
   28  *
   29  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   30  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   31  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   32  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   33  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   34  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   35  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   36  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   37  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   38  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   39  * POSSIBILITY OF SUCH DAMAGE.
   40  */
   41 
   42 /*
   43  * Copyright (c) 1989, 1993
   44  *      The Regents of the University of California.  All rights reserved.
   45  * (c) UNIX System Laboratories, Inc.
   46  * All or some portions of this file are derived from material licensed
   47  * to the University of California by American Telephone and Telegraph
   48  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   49  * the permission of UNIX System Laboratories, Inc.
   50  *
   51  * Redistribution and use in source and binary forms, with or without
   52  * modification, are permitted provided that the following conditions
   53  * are met:
   54  * 1. Redistributions of source code must retain the above copyright
   55  *    notice, this list of conditions and the following disclaimer.
   56  * 2. Redistributions in binary form must reproduce the above copyright
   57  *    notice, this list of conditions and the following disclaimer in the
   58  *    documentation and/or other materials provided with the distribution.
   59  * 3. Neither the name of the University nor the names of its contributors
   60  *    may be used to endorse or promote products derived from this software
   61  *    without specific prior written permission.
   62  *
   63  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   64  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   65  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   66  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   67  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   68  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   69  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   70  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   71  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   72  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   73  * SUCH DAMAGE.
   74  *
   75  *      @(#)vfs_subr.c  8.13 (Berkeley) 4/18/94
   76  */
   77 
   78 /*
   79  * External virtual filesystem routines
   80  */
   81 
   82 #include <sys/cdefs.h>
   83 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.276.2.3 2007/08/01 14:45:47 liamjfoy Exp $");
   84 
   85 #include "opt_inet.h"
   86 #include "opt_ddb.h"
   87 #include "opt_compat_netbsd.h"
   88 #include "opt_compat_43.h"
   89 
   90 #include <sys/param.h>
   91 #include <sys/systm.h>
   92 #include <sys/proc.h>
   93 #include <sys/kernel.h>
   94 #include <sys/mount.h>
   95 #include <sys/fcntl.h>
   96 #include <sys/vnode.h>
   97 #include <sys/stat.h>
   98 #include <sys/namei.h>
   99 #include <sys/ucred.h>
  100 #include <sys/buf.h>
  101 #include <sys/errno.h>
  102 #include <sys/malloc.h>
  103 #include <sys/domain.h>
  104 #include <sys/mbuf.h>
  105 #include <sys/sa.h>
  106 #include <sys/syscallargs.h>
  107 #include <sys/device.h>
  108 #include <sys/filedesc.h>
  109 #include <sys/kauth.h>
  110 
  111 #include <miscfs/specfs/specdev.h>
  112 #include <miscfs/genfs/genfs.h>
  113 #include <miscfs/syncfs/syncfs.h>
  114 
  115 #include <uvm/uvm.h>
  116 #include <uvm/uvm_readahead.h>
  117 #include <uvm/uvm_ddb.h>
  118 
  119 #include <sys/sysctl.h>
  120 
  121 const enum vtype iftovt_tab[16] = {
  122         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
  123         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
  124 };
  125 const int       vttoif_tab[9] = {
  126         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
  127         S_IFSOCK, S_IFIFO, S_IFMT,
  128 };
  129 
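/*
 * Editor's note: illustrative sketch, not part of the original file.
 * The two tables above back the IFTOVT()/VTTOIF() conversions between
 * the S_IFMT bits of a file mode and the vnode type.  A hedged example
 * of the indexing they imply (sys/vnode.h defines the real macros):
 */
#if 0
static enum vtype
mode_to_vtype(mode_t mode)
{

        /* The file-type bits occupy the top nibble of the mode. */
        return iftovt_tab[(mode & S_IFMT) >> 12];
}

static int
vtype_to_mode(enum vtype type)
{

        /* Inverse mapping; VNON maps to 0. */
        return vttoif_tab[type];
}
#endif
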
  130 int doforce = 1;                /* 1 => permit forcible unmounting */
  131 int prtactive = 0;              /* 1 => print out reclaim of active vnodes */
  132 
  133 extern int dovfsusermount;      /* 1 => permit any user to mount filesystems */
  134 extern int vfs_magiclinks;      /* 1 => expand "magic" symlinks */
  135 
  136 /*
  137  * Insq/Remq for the vnode usage lists.
  138  */
  139 #define bufinsvn(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_vnbufs)
  140 #define bufremvn(bp) {                                                  \
  141         LIST_REMOVE(bp, b_vnbufs);                                      \
  142         (bp)->b_vnbufs.le_next = NOLIST;                                \
  143 }
  144 /* TAILQ_HEAD(freelst, vnode) vnode_free_list = vnode free list (in vnode.h) */
  145 struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
  146 struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
  147 
  148 struct mntlist mountlist =                      /* mounted filesystem list */
  149     CIRCLEQ_HEAD_INITIALIZER(mountlist);
  150 struct vfs_list_head vfs_list =                 /* vfs list */
  151     LIST_HEAD_INITIALIZER(vfs_list);
  152 
  153 struct simplelock mountlist_slock = SIMPLELOCK_INITIALIZER;
  154 static struct simplelock mntid_slock = SIMPLELOCK_INITIALIZER;
  155 struct simplelock mntvnode_slock = SIMPLELOCK_INITIALIZER;
  156 struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER;
  157 struct simplelock spechash_slock = SIMPLELOCK_INITIALIZER;
  158 
  159 /* XXX - gross; single global lock to protect v_numoutput */
  160 struct simplelock global_v_numoutput_slock = SIMPLELOCK_INITIALIZER;
  161 
  162 /*
  163  * These define the root filesystem and device.
  164  */
  165 struct mount *rootfs;
  166 struct vnode *rootvnode;
  167 struct device *root_device;                     /* root device */
  168 
  169 POOL_INIT(vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl",
  170     &pool_allocator_nointr);
  171 
  172 MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
  173 
  174 /*
  175  * Local declarations.
  176  */
  177 
  178 static specificdata_domain_t mount_specificdata_domain;
  179 
  180 static void insmntque(struct vnode *, struct mount *);
  181 static int getdevvp(dev_t, struct vnode **, enum vtype);
  182 static void vclean(struct vnode *, int, struct lwp *);
  183 static struct vnode *getcleanvnode(struct lwp *);
  184 
  185 #ifdef DEBUG
  186 void printlockedvnodes(void);
  187 #endif
  188 
  189 /*
  190  * Initialize the vnode management data structures.
  191  */
  192 void
  193 vntblinit(void)
  194 {
  195 
  196         mount_specificdata_domain = specificdata_domain_create();
  197 
  198         /*
  199          * Initialize the filesystem syncer.
  200          */
  201         vn_initialize_syncerd();
  202 }
  203 
  204 int
  205 vfs_drainvnodes(long target, struct lwp *l)
  206 {
  207 
  208         simple_lock(&vnode_free_list_slock);
  209         while (numvnodes > target) {
  210                 struct vnode *vp;
  211 
  212                 vp = getcleanvnode(l);
  213                 if (vp == NULL)
  214                         return EBUSY; /* give up */
  215                 pool_put(&vnode_pool, vp);
  216                 simple_lock(&vnode_free_list_slock);
  217                 numvnodes--;
  218         }
  219         simple_unlock(&vnode_free_list_slock);
  220 
  221         return 0;
  222 }
  223 
  224 /*
   225  * Grab a vnode from the free list and clean it.
  226  */
  227 struct vnode *
  228 getcleanvnode(struct lwp *l)
  229 {
  230         struct vnode *vp;
  231         struct mount *mp;
  232         struct freelst *listhd;
  233 
  234         LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock));
  235 
  236         listhd = &vnode_free_list;
  237 try_nextlist:
  238         TAILQ_FOREACH(vp, listhd, v_freelist) {
  239                 if (!simple_lock_try(&vp->v_interlock))
  240                         continue;
  241                 /*
   242                  * As our lwp might hold the underlying vnode locked,
   243                  * don't try to reclaim a VLAYER vnode that is locked.
  244                  */
  245                 if ((vp->v_flag & VXLOCK) == 0 &&
  246                     ((vp->v_flag & VLAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
  247                         if (vn_start_write(vp, &mp, V_NOWAIT) == 0)
  248                                 break;
  249                 }
  250                 mp = NULL;
  251                 simple_unlock(&vp->v_interlock);
  252         }
  253 
  254         if (vp == NULLVP) {
  255                 if (listhd == &vnode_free_list) {
  256                         listhd = &vnode_hold_list;
  257                         goto try_nextlist;
  258                 }
  259                 simple_unlock(&vnode_free_list_slock);
  260                 return NULLVP;
  261         }
  262 
  263         if (vp->v_usecount)
  264                 panic("free vnode isn't, vp %p", vp);
  265         TAILQ_REMOVE(listhd, vp, v_freelist);
  266         /* see comment on why 0xdeadb is set at end of vgone (below) */
  267         vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
  268         simple_unlock(&vnode_free_list_slock);
  269         vp->v_lease = NULL;
  270 
  271         if (vp->v_type != VBAD)
  272                 vgonel(vp, l);
  273         else
  274                 simple_unlock(&vp->v_interlock);
  275         vn_finished_write(mp, 0);
  276 #ifdef DIAGNOSTIC
  277         if (vp->v_data || vp->v_uobj.uo_npages ||
  278             TAILQ_FIRST(&vp->v_uobj.memq))
  279                 panic("cleaned vnode isn't, vp %p", vp);
  280         if (vp->v_numoutput)
  281                 panic("clean vnode has pending I/O's, vp %p", vp);
  282 #endif
  283         KASSERT((vp->v_flag & VONWORKLST) == 0);
  284 
  285         return vp;
  286 }
  287 
  288 /*
  289  * Mark a mount point as busy. Used to synchronize access and to delay
  290  * unmounting. Interlock is not released on failure.
  291  */
  292 int
  293 vfs_busy(struct mount *mp, int flags, struct simplelock *interlkp)
  294 {
  295         int lkflags;
  296 
  297         while (mp->mnt_iflag & IMNT_UNMOUNT) {
  298                 int gone, n;
  299 
  300                 if (flags & LK_NOWAIT)
  301                         return (ENOENT);
  302                 if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL
  303                     && mp->mnt_unmounter == curlwp)
  304                         return (EDEADLK);
  305                 if (interlkp)
  306                         simple_unlock(interlkp);
  307                 /*
  308                  * Since all busy locks are shared except the exclusive
  309                  * lock granted when unmounting, the only place that a
  310                  * wakeup needs to be done is at the release of the
  311                  * exclusive lock at the end of dounmount.
  312                  */
  313                 simple_lock(&mp->mnt_slock);
  314                 mp->mnt_wcnt++;
  315                 ltsleep((caddr_t)mp, PVFS, "vfs_busy", 0, &mp->mnt_slock);
  316                 n = --mp->mnt_wcnt;
  317                 simple_unlock(&mp->mnt_slock);
  318                 gone = mp->mnt_iflag & IMNT_GONE;
  319 
  320                 if (n == 0)
  321                         wakeup(&mp->mnt_wcnt);
  322                 if (interlkp)
  323                         simple_lock(interlkp);
  324                 if (gone)
  325                         return (ENOENT);
  326         }
  327         lkflags = LK_SHARED;
  328         if (interlkp)
  329                 lkflags |= LK_INTERLOCK;
  330         if (lockmgr(&mp->mnt_lock, lkflags, interlkp))
  331                 panic("vfs_busy: unexpected lock failure");
  332         return (0);
  333 }
  334 
  335 /*
  336  * Free a busy filesystem.
  337  */
  338 void
  339 vfs_unbusy(struct mount *mp)
  340 {
  341 
  342         lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
  343 }
  344 
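/*
 * Editor's note: illustrative sketch, not part of the original file.
 * The usual caller pattern pairs vfs_busy()/vfs_unbusy() with
 * mountlist_slock when walking the mount list, so a concurrent unmount
 * is either waited for or skipped.  A hedged sketch of that loop:
 */
#if 0
static void
walk_mounts_example(void)
{
        struct mount *mp, *nmp;

        simple_lock(&mountlist_slock);
        for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
            mp = nmp) {
                /* On success, vfs_busy() releases the interlock for us. */
                if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
                        nmp = CIRCLEQ_NEXT(mp, mnt_list);
                        continue;
                }
                /* ... operate on the busied mount here ... */
                simple_lock(&mountlist_slock);
                nmp = CIRCLEQ_NEXT(mp, mnt_list);
                vfs_unbusy(mp);
        }
        simple_unlock(&mountlist_slock);
}
#endif
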
  345 /*
  346  * Lookup a filesystem type, and if found allocate and initialize
  347  * a mount structure for it.
  348  *
  349  * Devname is usually updated by mount(8) after booting.
  350  */
  351 int
  352 vfs_rootmountalloc(const char *fstypename, const char *devname,
  353     struct mount **mpp)
  354 {
  355         struct vfsops *vfsp = NULL;
  356         struct mount *mp;
  357 
  358         LIST_FOREACH(vfsp, &vfs_list, vfs_list)
  359                 if (!strncmp(vfsp->vfs_name, fstypename, MFSNAMELEN))
  360                         break;
  361 
  362         if (vfsp == NULL)
  363                 return (ENODEV);
  364         mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
  365         memset((char *)mp, 0, (u_long)sizeof(struct mount));
  366         lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
  367         simple_lock_init(&mp->mnt_slock);
  368         (void)vfs_busy(mp, LK_NOWAIT, 0);
  369         TAILQ_INIT(&mp->mnt_vnodelist);
  370         mp->mnt_op = vfsp;
  371         mp->mnt_flag = MNT_RDONLY;
  372         mp->mnt_vnodecovered = NULLVP;
  373         mp->mnt_leaf = mp;
  374         vfsp->vfs_refcount++;
  375         strncpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, MFSNAMELEN);
  376         mp->mnt_stat.f_mntonname[0] = '/';
  377         (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
  378         mount_initspecific(mp);
  379         *mpp = mp;
  380         return (0);
  381 }
  382 
  383 /*
  384  * Lookup a mount point by filesystem identifier.
  385  */
  386 struct mount *
  387 vfs_getvfs(fsid_t *fsid)
  388 {
  389         struct mount *mp;
  390 
  391         simple_lock(&mountlist_slock);
  392         CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
  393                 if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
  394                     mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
  395                         simple_unlock(&mountlist_slock);
  396                         return (mp);
  397                 }
  398         }
  399         simple_unlock(&mountlist_slock);
  400         return ((struct mount *)0);
  401 }
  402 
  403 /*
  404  * Get a new unique fsid
  405  */
  406 void
  407 vfs_getnewfsid(struct mount *mp)
  408 {
  409         static u_short xxxfs_mntid;
  410         fsid_t tfsid;
  411         int mtype;
  412 
  413         simple_lock(&mntid_slock);
  414         mtype = makefstype(mp->mnt_op->vfs_name);
  415         mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
  416         mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
  417         mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
  418         if (xxxfs_mntid == 0)
  419                 ++xxxfs_mntid;
  420         tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
  421         tfsid.__fsid_val[1] = mtype;
  422         if (!CIRCLEQ_EMPTY(&mountlist)) {
  423                 while (vfs_getvfs(&tfsid)) {
  424                         tfsid.__fsid_val[0]++;
  425                         xxxfs_mntid++;
  426                 }
  427         }
  428         mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
  429         mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
  430         simple_unlock(&mntid_slock);
  431 }
  432 
  433 /*
  434  * Make a 'unique' number from a mount type name.
  435  */
  436 long
  437 makefstype(const char *type)
  438 {
  439         long rv;
  440 
  441         for (rv = 0; *type; type++) {
  442                 rv <<= 2;
  443                 rv ^= *type;
  444         }
  445         return rv;
  446 }
  447 
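/*
 * Editor's note: illustrative worked example, not part of the original
 * file.  makefstype() folds each byte of the name into the result with
 * a two-bit shift and an XOR, so
 *
 *      makefstype("ffs") = ((('f' << 2) ^ 'f') << 2) ^ 's'
 *                        = (((102 << 2) ^ 102) << 2) ^ 115
 *                        = (510 << 2) ^ 115 = 1931
 *
 * Distinct names can collide, which is why the comment above puts
 * 'unique' in quotes.
 */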
  448 
  449 /*
  450  * Set vnode attributes to VNOVAL
  451  */
  452 void
  453 vattr_null(struct vattr *vap)
  454 {
  455 
  456         vap->va_type = VNON;
  457 
  458         /*
  459          * Assign individually so that it is safe even if size and
  460          * sign of each member are varied.
  461          */
  462         vap->va_mode = VNOVAL;
  463         vap->va_nlink = VNOVAL;
  464         vap->va_uid = VNOVAL;
  465         vap->va_gid = VNOVAL;
  466         vap->va_fsid = VNOVAL;
  467         vap->va_fileid = VNOVAL;
  468         vap->va_size = VNOVAL;
  469         vap->va_blocksize = VNOVAL;
  470         vap->va_atime.tv_sec =
  471             vap->va_mtime.tv_sec =
  472             vap->va_ctime.tv_sec =
  473             vap->va_birthtime.tv_sec = VNOVAL;
  474         vap->va_atime.tv_nsec =
  475             vap->va_mtime.tv_nsec =
  476             vap->va_ctime.tv_nsec =
  477             vap->va_birthtime.tv_nsec = VNOVAL;
  478         vap->va_gen = VNOVAL;
  479         vap->va_flags = VNOVAL;
  480         vap->va_rdev = VNOVAL;
  481         vap->va_bytes = VNOVAL;
  482         vap->va_vaflags = 0;
  483 }
  484 
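/*
 * Editor's note: illustrative sketch, not part of the original file.
 * Callers use vattr_null() to mark every attribute "unchanged" and then
 * set only the fields they mean to update before VOP_SETATTR().  A
 * hedged example (assumes a locked vnode vp, credentials cred, and an
 * lwp l):
 */
#if 0
static int
truncate_example(struct vnode *vp, kauth_cred_t cred, struct lwp *l)
{
        struct vattr va;

        vattr_null(&va);
        va.va_size = 0;         /* the only attribute to change */
        return VOP_SETATTR(vp, &va, cred, l);
}
#endif
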
  485 /*
  486  * Routines having to do with the management of the vnode table.
  487  */
  488 extern int (**dead_vnodeop_p)(void *);
  489 long numvnodes;
  490 
  491 /*
   492  * Allocate a fresh vnode, or recycle the next one from the free lists.
  493  */
  494 int
  495 getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
  496     struct vnode **vpp)
  497 {
  498         extern struct uvm_pagerops uvm_vnodeops;
  499         struct uvm_object *uobj;
  500         struct lwp *l = curlwp;         /* XXX */
  501         static int toggle;
  502         struct vnode *vp;
  503         int error = 0, tryalloc;
  504 
  505  try_again:
  506         if (mp) {
  507                 /*
  508                  * Mark filesystem busy while we're creating a vnode.
  509                  * If unmount is in progress, this will wait; if the
  510                  * unmount succeeds (only if umount -f), this will
  511                  * return an error.  If the unmount fails, we'll keep
  512                  * going afterwards.
  513                  * (This puts the per-mount vnode list logically under
  514                  * the protection of the vfs_busy lock).
  515                  */
  516                 error = vfs_busy(mp, LK_RECURSEFAIL, 0);
  517                 if (error && error != EDEADLK)
  518                         return error;
  519         }
  520 
  521         /*
  522          * We must choose whether to allocate a new vnode or recycle an
  523          * existing one. The criterion for allocating a new one is that
  524          * the total number of vnodes is less than the number desired or
  525          * there are no vnodes on either free list. Generally we only
  526          * want to recycle vnodes that have no buffers associated with
  527          * them, so we look first on the vnode_free_list. If it is empty,
  528          * we next consider vnodes with referencing buffers on the
  529          * vnode_hold_list. The toggle ensures that half the time we
   530  * will use a vnode from the vnode_hold_list, and half the time
  531          * we will allocate a new one unless the list has grown to twice
   532  * the desired size. We are reluctant to recycle vnodes from the
  533          * vnode_hold_list because we will lose the identity of all its
  534          * referencing buffers.
  535          */
  536 
  537         vp = NULL;
  538 
  539         simple_lock(&vnode_free_list_slock);
  540 
  541         toggle ^= 1;
  542         if (numvnodes > 2 * desiredvnodes)
  543                 toggle = 0;
  544 
  545         tryalloc = numvnodes < desiredvnodes ||
  546             (TAILQ_FIRST(&vnode_free_list) == NULL &&
  547              (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
  548 
  549         if (tryalloc &&
  550             (vp = pool_get(&vnode_pool, PR_NOWAIT)) != NULL) {
  551                 numvnodes++;
  552                 simple_unlock(&vnode_free_list_slock);
  553                 memset(vp, 0, sizeof(*vp));
  554                 UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 1);
  555                 /*
  556                  * done by memset() above.
  557                  *      LIST_INIT(&vp->v_nclist);
  558                  *      LIST_INIT(&vp->v_dnclist);
  559                  */
  560         } else {
  561                 vp = getcleanvnode(l);
  562                 /*
  563                  * Unless this is a bad time of the month, at most
  564                  * the first NCPUS items on the free list are
  565                  * locked, so this is close enough to being empty.
  566                  */
  567                 if (vp == NULLVP) {
  568                         if (mp && error != EDEADLK)
  569                                 vfs_unbusy(mp);
  570                         if (tryalloc) {
  571                                 printf("WARNING: unable to allocate new "
  572                                     "vnode, retrying...\n");
  573                                 (void) tsleep(&lbolt, PRIBIO, "newvn", hz);
  574                                 goto try_again;
  575                         }
  576                         tablefull("vnode", "increase kern.maxvnodes or NVNODE");
  577                         *vpp = 0;
  578                         return (ENFILE);
  579                 }
  580                 vp->v_usecount = 1;
  581                 vp->v_flag = 0;
  582                 vp->v_socket = NULL;
  583         }
  584         vp->v_type = VNON;
  585         vp->v_vnlock = &vp->v_lock;
  586         lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
  587         KASSERT(LIST_EMPTY(&vp->v_nclist));
  588         KASSERT(LIST_EMPTY(&vp->v_dnclist));
  589         vp->v_tag = tag;
  590         vp->v_op = vops;
  591         insmntque(vp, mp);
  592         *vpp = vp;
  593         vp->v_data = 0;
  594         simple_lock_init(&vp->v_interlock);
  595 
  596         /*
  597          * initialize uvm_object within vnode.
  598          */
  599 
  600         uobj = &vp->v_uobj;
  601         KASSERT(uobj->pgops == &uvm_vnodeops);
  602         KASSERT(uobj->uo_npages == 0);
  603         KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
  604         vp->v_size = VSIZENOTSET;
  605 
  606         if (mp && error != EDEADLK)
  607                 vfs_unbusy(mp);
  608         return (0);
  609 }
  610 
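/*
 * Editor's note: illustrative sketch, not part of the original file.
 * A filesystem's VFS_VGET/inode-allocation path typically calls
 * getnewvnode() with its own tag and vnodeop vector, then attaches its
 * private data.  A hedged fragment (the ffs-style names are
 * assumptions):
 */
#if 0
        error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, &vp);
        if (error)
                return error;
        vp->v_data = ip;        /* attach fs-private inode */
        ip->i_vnode = vp;
#endif
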
  611 /*
  612  * This is really just the reverse of getnewvnode(). Needed for
   613  * VFS_VGET functions that may need to push back a vnode in case
  614  * of a locking race.
  615  */
  616 void
  617 ungetnewvnode(struct vnode *vp)
  618 {
  619 #ifdef DIAGNOSTIC
  620         if (vp->v_usecount != 1)
  621                 panic("ungetnewvnode: busy vnode");
  622 #endif
  623         vp->v_usecount--;
  624         insmntque(vp, NULL);
  625         vp->v_type = VBAD;
  626 
  627         simple_lock(&vp->v_interlock);
  628         /*
  629          * Insert at head of LRU list
  630          */
  631         simple_lock(&vnode_free_list_slock);
  632         if (vp->v_holdcnt > 0)
  633                 TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist);
  634         else
  635                 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
  636         simple_unlock(&vnode_free_list_slock);
  637         simple_unlock(&vp->v_interlock);
  638 }
  639 
  640 /*
  641  * Move a vnode from one mount queue to another.
  642  */
  643 static void
  644 insmntque(struct vnode *vp, struct mount *mp)
  645 {
  646 
  647 #ifdef DIAGNOSTIC
  648         if ((mp != NULL) &&
  649             (mp->mnt_iflag & IMNT_UNMOUNT) &&
  650             !(mp->mnt_flag & MNT_SOFTDEP) &&
  651             vp->v_tag != VT_VFS) {
  652                 panic("insmntque into dying filesystem");
  653         }
  654 #endif
  655 
  656         simple_lock(&mntvnode_slock);
  657         /*
  658          * Delete from old mount point vnode list, if on one.
  659          */
  660         if (vp->v_mount != NULL)
  661                 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
  662         /*
  663          * Insert into list of vnodes for the new mount point, if available.
  664          */
  665         if ((vp->v_mount = mp) != NULL) {
  666                 if (TAILQ_EMPTY(&mp->mnt_vnodelist)) {
  667                         TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
  668                 } else {
  669                         TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
  670                 }
  671         }
  672         simple_unlock(&mntvnode_slock);
  673 }
  674 
  675 /*
  676  * Update outstanding I/O count and do wakeup if requested.
  677  */
  678 void
  679 vwakeup(struct buf *bp)
  680 {
  681         struct vnode *vp;
  682 
  683         if ((vp = bp->b_vp) != NULL) {
  684                 /* XXX global lock hack
  685                  * can't use v_interlock here since this is called
  686                  * in interrupt context from biodone().
  687                  */
  688                 simple_lock(&global_v_numoutput_slock);
  689                 if (--vp->v_numoutput < 0)
  690                         panic("vwakeup: neg numoutput, vp %p", vp);
  691                 if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
  692                         vp->v_flag &= ~VBWAIT;
  693                         wakeup((caddr_t)&vp->v_numoutput);
  694                 }
  695                 simple_unlock(&global_v_numoutput_slock);
  696         }
  697 }
  698 
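/*
 * Editor's note: illustrative sketch, not part of the original file.
 * vwakeup() is the consumer side of the v_numoutput count; the producer
 * side, when queueing a write, bumps the counter under the same global
 * lock, roughly:
 */
#if 0
        s = splbio();
        simple_lock(&global_v_numoutput_slock);
        vp->v_numoutput++;
        simple_unlock(&global_v_numoutput_slock);
        splx(s);
#endif
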
  699 /*
  700  * Flush out and invalidate all buffers associated with a vnode.
  701  * Called with the underlying vnode locked, which should prevent new dirty
  702  * buffers from being queued.
  703  */
  704 int
  705 vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l,
  706     int slpflag, int slptimeo)
  707 {
  708         struct buf *bp, *nbp;
  709         int s, error;
  710         int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
  711                 (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0);
  712 
  713         /* XXXUBC this doesn't look at flags or slp* */
  714         simple_lock(&vp->v_interlock);
  715         error = VOP_PUTPAGES(vp, 0, 0, flushflags);
  716         if (error) {
  717                 return error;
  718         }
  719 
  720         if (flags & V_SAVE) {
  721                 error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0, l);
  722                 if (error)
  723                         return (error);
  724 #ifdef DIAGNOSTIC
  725                 s = splbio();
  726                 if (vp->v_numoutput > 0 || !LIST_EMPTY(&vp->v_dirtyblkhd))
  727                         panic("vinvalbuf: dirty bufs, vp %p", vp);
  728                 splx(s);
  729 #endif
  730         }
  731 
  732         s = splbio();
  733 
  734 restart:
  735         for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
  736                 nbp = LIST_NEXT(bp, b_vnbufs);
  737                 simple_lock(&bp->b_interlock);
  738                 if (bp->b_flags & B_BUSY) {
  739                         bp->b_flags |= B_WANTED;
  740                         error = ltsleep((caddr_t)bp,
  741                                     slpflag | (PRIBIO + 1) | PNORELOCK,
  742                                     "vinvalbuf", slptimeo, &bp->b_interlock);
  743                         if (error) {
  744                                 splx(s);
  745                                 return (error);
  746                         }
  747                         goto restart;
  748                 }
  749                 bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
  750                 simple_unlock(&bp->b_interlock);
  751                 brelse(bp);
  752         }
  753 
  754         for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
  755                 nbp = LIST_NEXT(bp, b_vnbufs);
  756                 simple_lock(&bp->b_interlock);
  757                 if (bp->b_flags & B_BUSY) {
  758                         bp->b_flags |= B_WANTED;
  759                         error = ltsleep((caddr_t)bp,
  760                                     slpflag | (PRIBIO + 1) | PNORELOCK,
  761                                     "vinvalbuf", slptimeo, &bp->b_interlock);
  762                         if (error) {
  763                                 splx(s);
  764                                 return (error);
  765                         }
  766                         goto restart;
  767                 }
  768                 /*
  769                  * XXX Since there are no node locks for NFS, I believe
  770                  * there is a slight chance that a delayed write will
  771                  * occur while sleeping just above, so check for it.
  772                  */
  773                 if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
  774 #ifdef DEBUG
  775                         printf("buffer still DELWRI\n");
  776 #endif
  777                         bp->b_flags |= B_BUSY | B_VFLUSH;
  778                         simple_unlock(&bp->b_interlock);
  779                         VOP_BWRITE(bp);
  780                         goto restart;
  781                 }
  782                 bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
  783                 simple_unlock(&bp->b_interlock);
  784                 brelse(bp);
  785         }
  786 
  787 #ifdef DIAGNOSTIC
  788         if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
  789                 panic("vinvalbuf: flush failed, vp %p", vp);
  790 #endif
  791 
  792         splx(s);
  793 
  794         return (0);
  795 }
  796 
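/*
 * Editor's note: illustrative sketch, not part of the original file.
 * A typical caller flushes and saves dirty data before revoking or
 * recycling a vnode, falling back to discarding on error, as vclean()
 * below does.  Hedged example (vp locked, as the comment above
 * vinvalbuf() requires):
 */
#if 0
        error = vinvalbuf(vp, V_SAVE, cred, l, 0, 0);
        if (error) {
                /* Fall back: discard everything instead of saving. */
                error = vinvalbuf(vp, 0, cred, l, 0, 0);
        }
#endif
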
  797 /*
   798  * Destroy any in-core blocks past the truncation length.
  799  * Called with the underlying vnode locked, which should prevent new dirty
  800  * buffers from being queued.
  801  */
  802 int
  803 vtruncbuf(struct vnode *vp, daddr_t lbn, int slpflag, int slptimeo)
  804 {
  805         struct buf *bp, *nbp;
  806         int s, error;
  807         voff_t off;
  808 
  809         off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
  810         simple_lock(&vp->v_interlock);
  811         error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
  812         if (error) {
  813                 return error;
  814         }
  815 
  816         s = splbio();
  817 
  818 restart:
  819         for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
  820                 nbp = LIST_NEXT(bp, b_vnbufs);
  821                 if (bp->b_lblkno < lbn)
  822                         continue;
  823                 simple_lock(&bp->b_interlock);
  824                 if (bp->b_flags & B_BUSY) {
  825                         bp->b_flags |= B_WANTED;
  826                         error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
  827                             "vtruncbuf", slptimeo, &bp->b_interlock);
  828                         if (error) {
  829                                 splx(s);
  830                                 return (error);
  831                         }
  832                         goto restart;
  833                 }
  834                 bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
  835                 simple_unlock(&bp->b_interlock);
  836                 brelse(bp);
  837         }
  838 
  839         for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
  840                 nbp = LIST_NEXT(bp, b_vnbufs);
  841                 if (bp->b_lblkno < lbn)
  842                         continue;
  843                 simple_lock(&bp->b_interlock);
  844                 if (bp->b_flags & B_BUSY) {
  845                         bp->b_flags |= B_WANTED;
  846                         error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
  847                             "vtruncbuf", slptimeo, &bp->b_interlock);
  848                         if (error) {
  849                                 splx(s);
  850                                 return (error);
  851                         }
  852                         goto restart;
  853                 }
  854                 bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
  855                 simple_unlock(&bp->b_interlock);
  856                 brelse(bp);
  857         }
  858 
  859         splx(s);
  860 
  861         return (0);
  862 }
  863 
  864 void
  865 vflushbuf(struct vnode *vp, int sync)
  866 {
  867         struct buf *bp, *nbp;
  868         int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
  869         int s;
  870 
  871         simple_lock(&vp->v_interlock);
  872         (void) VOP_PUTPAGES(vp, 0, 0, flags);
  873 
  874 loop:
  875         s = splbio();
  876         for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
  877                 nbp = LIST_NEXT(bp, b_vnbufs);
  878                 simple_lock(&bp->b_interlock);
  879                 if ((bp->b_flags & B_BUSY)) {
  880                         simple_unlock(&bp->b_interlock);
  881                         continue;
  882                 }
  883                 if ((bp->b_flags & B_DELWRI) == 0)
  884                         panic("vflushbuf: not dirty, bp %p", bp);
  885                 bp->b_flags |= B_BUSY | B_VFLUSH;
  886                 simple_unlock(&bp->b_interlock);
  887                 splx(s);
  888                 /*
  889                  * Wait for I/O associated with indirect blocks to complete,
  890                  * since there is no way to quickly wait for them below.
  891                  */
  892                 if (bp->b_vp == vp || sync == 0)
  893                         (void) bawrite(bp);
  894                 else
  895                         (void) bwrite(bp);
  896                 goto loop;
  897         }
  898         if (sync == 0) {
  899                 splx(s);
  900                 return;
  901         }
  902         simple_lock(&global_v_numoutput_slock);
  903         while (vp->v_numoutput) {
  904                 vp->v_flag |= VBWAIT;
  905                 ltsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0,
  906                         &global_v_numoutput_slock);
  907         }
  908         simple_unlock(&global_v_numoutput_slock);
  909         splx(s);
  910         if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
  911                 vprint("vflushbuf: dirty", vp);
  912                 goto loop;
  913         }
  914 }
  915 
  916 /*
  917  * Associate a buffer with a vnode.
  918  */
  919 void
  920 bgetvp(struct vnode *vp, struct buf *bp)
  921 {
  922         int s;
  923 
  924         if (bp->b_vp)
  925                 panic("bgetvp: not free, bp %p", bp);
  926         VHOLD(vp);
  927         s = splbio();
  928         bp->b_vp = vp;
  929         if (vp->v_type == VBLK || vp->v_type == VCHR)
  930                 bp->b_dev = vp->v_rdev;
  931         else
  932                 bp->b_dev = NODEV;
  933         /*
  934          * Insert onto list for new vnode.
  935          */
  936         bufinsvn(bp, &vp->v_cleanblkhd);
  937         splx(s);
  938 }
  939 
  940 /*
  941  * Disassociate a buffer from a vnode.
  942  */
  943 void
  944 brelvp(struct buf *bp)
  945 {
  946         struct vnode *vp;
  947         int s;
  948 
  949         if (bp->b_vp == NULL)
  950                 panic("brelvp: vp NULL, bp %p", bp);
  951 
  952         s = splbio();
  953         vp = bp->b_vp;
  954         /*
  955          * Delete from old vnode list, if on one.
  956          */
  957         if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
  958                 bufremvn(bp);
  959 
  960         if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_flag & VONWORKLST) &&
  961             LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
  962                 vp->v_flag &= ~VWRITEMAPDIRTY;
  963                 vn_syncer_remove_from_worklist(vp);
  964         }
  965 
  966         bp->b_vp = NULL;
  967         HOLDRELE(vp);
  968         splx(s);
  969 }
  970 
  971 /*
  972  * Reassign a buffer from one vnode to another.
  973  * Used to assign file specific control information
  974  * (indirect blocks) to the vnode to which they belong.
  975  *
  976  * This function must be called at splbio().
  977  */
  978 void
  979 reassignbuf(struct buf *bp, struct vnode *newvp)
  980 {
  981         struct buflists *listheadp;
  982         int delayx;
  983 
  984         /*
  985          * Delete from old vnode list, if on one.
  986          */
  987         if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
  988                 bufremvn(bp);
  989         /*
  990          * If dirty, put on list of dirty buffers;
  991          * otherwise insert onto list of clean buffers.
  992          */
  993         if ((bp->b_flags & B_DELWRI) == 0) {
  994                 listheadp = &newvp->v_cleanblkhd;
  995                 if (TAILQ_EMPTY(&newvp->v_uobj.memq) &&
  996                     (newvp->v_flag & VONWORKLST) &&
  997                     LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
  998                         newvp->v_flag &= ~VWRITEMAPDIRTY;
  999                         vn_syncer_remove_from_worklist(newvp);
 1000                 }
 1001         } else {
 1002                 listheadp = &newvp->v_dirtyblkhd;
 1003                 if ((newvp->v_flag & VONWORKLST) == 0) {
 1004                         switch (newvp->v_type) {
 1005                         case VDIR:
 1006                                 delayx = dirdelay;
 1007                                 break;
 1008                         case VBLK:
 1009                                 if (newvp->v_specmountpoint != NULL) {
 1010                                         delayx = metadelay;
 1011                                         break;
 1012                                 }
 1013                                 /* fall through */
 1014                         default:
 1015                                 delayx = filedelay;
 1016                                 break;
 1017                         }
 1018                         if (!newvp->v_mount ||
 1019                             (newvp->v_mount->mnt_flag & MNT_ASYNC) == 0)
 1020                                 vn_syncer_add_to_worklist(newvp, delayx);
 1021                 }
 1022         }
 1023         bufinsvn(bp, listheadp);
 1024 }
 1025 
 1026 /*
 1027  * Create a vnode for a block device.
 1028  * Used for root filesystem and swap areas.
 1029  * Also used for memory file system special devices.
 1030  */
 1031 int
 1032 bdevvp(dev_t dev, struct vnode **vpp)
 1033 {
 1034 
 1035         return (getdevvp(dev, vpp, VBLK));
 1036 }
 1037 
 1038 /*
 1039  * Create a vnode for a character device.
 1040  * Used for kernfs and some console handling.
 1041  */
 1042 int
 1043 cdevvp(dev_t dev, struct vnode **vpp)
 1044 {
 1045 
 1046         return (getdevvp(dev, vpp, VCHR));
 1047 }
 1048 
 1049 /*
 1050  * Create a vnode for a device.
 1051  * Used by bdevvp (block device) for root file system etc.,
 1052  * and by cdevvp (character device) for console and kernfs.
 1053  */
 1054 static int
 1055 getdevvp(dev_t dev, struct vnode **vpp, enum vtype type)
 1056 {
 1057         struct vnode *vp;
 1058         struct vnode *nvp;
 1059         int error;
 1060 
 1061         if (dev == NODEV) {
 1062                 *vpp = NULLVP;
 1063                 return (0);
 1064         }
 1065         error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
 1066         if (error) {
 1067                 *vpp = NULLVP;
 1068                 return (error);
 1069         }
 1070         vp = nvp;
 1071         vp->v_type = type;
 1072         if ((nvp = checkalias(vp, dev, NULL)) != 0) {
 1073                 vput(vp);
 1074                 vp = nvp;
 1075         }
 1076         *vpp = vp;
 1077         return (0);
 1078 }
 1079 
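/*
 * Editor's note: illustrative sketch, not part of the original file.
 * bdevvp() is how early boot code obtains a vnode for the root device
 * before any filesystem is mounted, roughly (rootdev being the
 * kernel's root dev_t):
 */
#if 0
        if (bdevvp(rootdev, &rootvp))
                panic("cannot obtain root vnode");
#endif
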
 1080 /*
 1081  * Check to see if the new vnode represents a special device
 1082  * for which we already have a vnode (either because of
 1083  * bdevvp() or because of a different vnode representing
 1084  * the same block device). If such an alias exists, deallocate
 1085  * the existing contents and return the aliased vnode. The
 1086  * caller is responsible for filling it with its new contents.
 1087  */
 1088 struct vnode *
 1089 checkalias(struct vnode *nvp, dev_t nvp_rdev, struct mount *mp)
 1090 {
 1091         struct lwp *l = curlwp;         /* XXX */
 1092         struct vnode *vp;
 1093         struct vnode **vpp;
 1094 
 1095         if (nvp->v_type != VBLK && nvp->v_type != VCHR)
 1096                 return (NULLVP);
 1097 
 1098         vpp = &speclisth[SPECHASH(nvp_rdev)];
 1099 loop:
 1100         simple_lock(&spechash_slock);
 1101         for (vp = *vpp; vp; vp = vp->v_specnext) {
 1102                 if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
 1103                         continue;
 1104                 /*
 1105                  * Alias, but not in use, so flush it out.
 1106                  */
 1107                 simple_lock(&vp->v_interlock);
 1108                 simple_unlock(&spechash_slock);
 1109                 if (vp->v_usecount == 0) {
 1110                         vgonel(vp, l);
 1111                         goto loop;
 1112                 }
 1113                 /*
  1114                  * What we want to know here is whether someone else has
 1115                  * removed this vnode from the device hash list while we were
 1116                  * waiting.  This can only happen if vclean() did it, and
 1117                  * this requires the vnode to be locked.
 1118                  */
 1119                 if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK))
 1120                         goto loop;
 1121                 if (vp->v_specinfo == NULL) {
 1122                         vput(vp);
 1123                         goto loop;
 1124                 }
 1125                 simple_lock(&spechash_slock);
 1126                 break;
 1127         }
 1128         if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
 1129                 MALLOC(nvp->v_specinfo, struct specinfo *,
 1130                         sizeof(struct specinfo), M_VNODE, M_NOWAIT);
 1131                 /* XXX Erg. */
 1132                 if (nvp->v_specinfo == NULL) {
 1133                         simple_unlock(&spechash_slock);
 1134                         uvm_wait("checkalias");
 1135                         goto loop;
 1136                 }
 1137 
 1138                 nvp->v_rdev = nvp_rdev;
 1139                 nvp->v_hashchain = vpp;
 1140                 nvp->v_specnext = *vpp;
 1141                 nvp->v_specmountpoint = NULL;
 1142                 simple_unlock(&spechash_slock);
 1143                 nvp->v_speclockf = NULL;
 1144                 simple_lock_init(&nvp->v_spec_cow_slock);
 1145                 SLIST_INIT(&nvp->v_spec_cow_head);
 1146                 nvp->v_spec_cow_req = 0;
 1147                 nvp->v_spec_cow_count = 0;
 1148 
 1149                 *vpp = nvp;
 1150                 if (vp != NULLVP) {
 1151                         nvp->v_flag |= VALIASED;
 1152                         vp->v_flag |= VALIASED;
 1153                         vput(vp);
 1154                 }
 1155                 return (NULLVP);
 1156         }
 1157         simple_unlock(&spechash_slock);
 1158         VOP_UNLOCK(vp, 0);
 1159         simple_lock(&vp->v_interlock);
 1160         vclean(vp, 0, l);
 1161         vp->v_op = nvp->v_op;
 1162         vp->v_tag = nvp->v_tag;
 1163         vp->v_vnlock = &vp->v_lock;
 1164         lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
 1165         nvp->v_type = VNON;
 1166         insmntque(vp, mp);
 1167         return (vp);
 1168 }
 1169 
 1170 /*
 1171  * Grab a particular vnode from the free list, increment its
 1172  * reference count and lock it. If the vnode lock bit is set the
  1173  * vnode is being eliminated in vgone. In that case, we cannot
 1174  * grab the vnode, so the process is awakened when the transition is
 1175  * completed, and an error returned to indicate that the vnode is no
 1176  * longer usable (possibly having been changed to a new file system type).
 1177  */
 1178 int
 1179 vget(struct vnode *vp, int flags)
 1180 {
 1181         int error;
 1182 
 1183         /*
 1184          * If the vnode is in the process of being cleaned out for
 1185          * another use, we wait for the cleaning to finish and then
 1186          * return failure. Cleaning is determined by checking that
 1187          * the VXLOCK flag is set.
 1188          */
 1189 
 1190         if ((flags & LK_INTERLOCK) == 0)
 1191                 simple_lock(&vp->v_interlock);
 1192         if ((vp->v_flag & (VXLOCK | VFREEING)) != 0) {
 1193                 if (flags & LK_NOWAIT) {
 1194                         simple_unlock(&vp->v_interlock);
 1195                         return EBUSY;
 1196                 }
 1197                 vp->v_flag |= VXWANT;
 1198                 ltsleep(vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock);
 1199                 return (ENOENT);
 1200         }
 1201         if (vp->v_usecount == 0) {
 1202                 simple_lock(&vnode_free_list_slock);
 1203                 if (vp->v_holdcnt > 0)
 1204                         TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
 1205                 else
 1206                         TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 1207                 simple_unlock(&vnode_free_list_slock);
 1208         }
 1209         vp->v_usecount++;
 1210 #ifdef DIAGNOSTIC
 1211         if (vp->v_usecount == 0) {
 1212                 vprint("vget", vp);
 1213                 panic("vget: usecount overflow, vp %p", vp);
 1214         }
 1215 #endif
 1216         if (flags & LK_TYPE_MASK) {
 1217                 if ((error = vn_lock(vp, flags | LK_INTERLOCK))) {
 1218                         vrele(vp);
 1219                 }
 1220                 return (error);
 1221         }
 1222         simple_unlock(&vp->v_interlock);
 1223         return (0);
 1224 }
 1225 
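/*
 * Editor's note: illustrative sketch, not part of the original file.
 * The classic consumer of vget() is a filesystem hash lookup: find a
 * candidate vnode, try to take a locked reference, and retry the
 * lookup if the vnode was reclaimed out from under us (compare the
 * loop in checkalias() above).  A hedged sketch; hash_slock and
 * hash_lookup() are hypothetical names:
 */
#if 0
loop:
        simple_lock(&hash_slock);
        vp = hash_lookup(key);
        if (vp != NULL) {
                simple_lock(&vp->v_interlock);
                simple_unlock(&hash_slock);
                if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK))
                        goto loop;      /* vnode went away; retry */
                return vp;
        }
        simple_unlock(&hash_slock);
#endif
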
 1226 /*
 1227  * vput(), just unlock and vrele()
 1228  */
 1229 void
 1230 vput(struct vnode *vp)
 1231 {
 1232         struct lwp *l = curlwp;         /* XXX */
 1233 
 1234 #ifdef DIAGNOSTIC
 1235         if (vp == NULL)
 1236                 panic("vput: null vp");
 1237 #endif
 1238         simple_lock(&vp->v_interlock);
 1239         vp->v_usecount--;
 1240         if (vp->v_usecount > 0) {
 1241                 simple_unlock(&vp->v_interlock);
 1242                 VOP_UNLOCK(vp, 0);
 1243                 return;
 1244         }
 1245 #ifdef DIAGNOSTIC
 1246         if (vp->v_usecount < 0 || vp->v_writecount != 0) {
 1247                 vprint("vput: bad ref count", vp);
 1248                 panic("vput: ref cnt");
 1249         }
 1250 #endif
 1251         /*
 1252          * Insert at tail of LRU list.
 1253          */
 1254         simple_lock(&vnode_free_list_slock);
 1255         if (vp->v_holdcnt > 0)
 1256                 TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
 1257         else
 1258                 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 1259         simple_unlock(&vnode_free_list_slock);
 1260         if (vp->v_flag & VEXECMAP) {
 1261                 uvmexp.execpages -= vp->v_uobj.uo_npages;
 1262                 uvmexp.filepages += vp->v_uobj.uo_npages;
 1263         }
 1264         vp->v_flag &= ~(VTEXT|VEXECMAP|VWRITEMAP|VMAPPED);
 1265         simple_unlock(&vp->v_interlock);
 1266         VOP_INACTIVE(vp, l);
 1267 }
 1268 
 1269 /*
 1270  * Vnode release.
 1271  * If count drops to zero, call inactive routine and return to freelist.
 1272  */
 1273 void
 1274 vrele(struct vnode *vp)
 1275 {
 1276         struct lwp *l = curlwp;         /* XXX */
 1277 
 1278 #ifdef DIAGNOSTIC
 1279         if (vp == NULL)
 1280                 panic("vrele: null vp");
 1281 #endif
 1282         simple_lock(&vp->v_interlock);
 1283         vp->v_usecount--;
 1284         if (vp->v_usecount > 0) {
 1285                 simple_unlock(&vp->v_interlock);
 1286                 return;
 1287         }
 1288 #ifdef DIAGNOSTIC
 1289         if (vp->v_usecount < 0 || vp->v_writecount != 0) {
 1290                 vprint("vrele: bad ref count", vp);
 1291                 panic("vrele: ref cnt vp %p", vp);
 1292         }
 1293 #endif
 1294         /*
 1295          * Insert at tail of LRU list.
 1296          */
 1297         simple_lock(&vnode_free_list_slock);
 1298         if (vp->v_holdcnt > 0)
 1299                 TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
 1300         else
 1301                 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 1302         simple_unlock(&vnode_free_list_slock);
 1303         if (vp->v_flag & VEXECMAP) {
 1304                 uvmexp.execpages -= vp->v_uobj.uo_npages;
 1305                 uvmexp.filepages += vp->v_uobj.uo_npages;
 1306         }
 1307         vp->v_flag &= ~(VTEXT|VEXECMAP|VWRITEMAP|VMAPPED);
 1308         if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
 1309                 VOP_INACTIVE(vp, l);
 1310 }
 1311 
 1312 /*
 1313  * Page or buffer structure gets a reference.
 1314  * Called with v_interlock held.
 1315  */
 1316 void
 1317 vholdl(struct vnode *vp)
 1318 {
 1319 
 1320         /*
 1321          * If it is on the freelist and the hold count is currently
 1322          * zero, move it to the hold list. The test of the back
 1323          * pointer and the use reference count of zero is because
 1324          * it will be removed from a free list by getnewvnode,
 1325          * but will not have its reference count incremented until
 1326          * after calling vgone. If the reference count were
 1327          * incremented first, vgone would (incorrectly) try to
 1328          * close the previous instance of the underlying object.
 1329          * So, the back pointer is explicitly set to `0xdeadb' in
 1330          * getnewvnode after removing it from a freelist to ensure
 1331          * that we do not try to move it here.
 1332          */
 1333         if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
 1334             vp->v_holdcnt == 0 && vp->v_usecount == 0) {
 1335                 simple_lock(&vnode_free_list_slock);
 1336                 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 1337                 TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
 1338                 simple_unlock(&vnode_free_list_slock);
 1339         }
 1340         vp->v_holdcnt++;
 1341 }
 1342 
 1343 /*
 1344  * Page or buffer structure frees a reference.
 1345  * Called with v_interlock held.
 1346  */
 1347 void
 1348 holdrelel(struct vnode *vp)
 1349 {
 1350 
 1351         if (vp->v_holdcnt <= 0)
 1352                 panic("holdrelel: holdcnt vp %p", vp);
 1353         vp->v_holdcnt--;
 1354 
 1355         /*
 1356          * If it is on the holdlist and the hold count drops to
 1357          * zero, move it to the free list. The test of the back
 1358          * pointer and the use reference count of zero is because
 1359          * it will be removed from a free list by getnewvnode,
 1360          * but will not have its reference count incremented until
 1361          * after calling vgone. If the reference count were
 1362          * incremented first, vgone would (incorrectly) try to
 1363          * close the previous instance of the underlying object.
 1364          * So, the back pointer is explicitly set to `0xdeadb' in
 1365          * getnewvnode after removing it from a freelist to ensure
 1366          * that we do not try to move it here.
 1367          */
 1368 
 1369         if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
 1370             vp->v_holdcnt == 0 && vp->v_usecount == 0) {
 1371                 simple_lock(&vnode_free_list_slock);
 1372                 TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
 1373                 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 1374                 simple_unlock(&vnode_free_list_slock);
 1375         }
 1376 }
 1377 
 1378 /*
 1379  * Vnode reference.
 1380  */
 1381 void
 1382 vref(struct vnode *vp)
 1383 {
 1384 
 1385         simple_lock(&vp->v_interlock);
 1386         if (vp->v_usecount <= 0)
 1387                 panic("vref used where vget required, vp %p", vp);
 1388         vp->v_usecount++;
 1389 #ifdef DIAGNOSTIC
 1390         if (vp->v_usecount == 0) {
 1391                 vprint("vref", vp);
 1392                 panic("vref: usecount overflow, vp %p", vp);
 1393         }
 1394 #endif
 1395         simple_unlock(&vp->v_interlock);
 1396 }
 1397 
 1398 /*
 1399  * Remove any vnodes in the vnode table belonging to mount point mp.
 1400  *
 1401  * If FORCECLOSE is not specified, there should not be any active ones,
 1402  * return error if any are found (nb: this is a user error, not a
 1403  * system error). If FORCECLOSE is specified, detach any active vnodes
 1404  * that are found.
 1405  *
 1406  * If WRITECLOSE is set, only flush out regular file vnodes open for
 1407  * writing.
 1408  *
 1409  * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped.
 1410  */
 1411 #ifdef DEBUG
 1412 int busyprt = 0;        /* print out busy vnodes */
 1413 struct ctldebug debug1 = { "busyprt", &busyprt };
 1414 #endif
 1415 
 1416 int
 1417 vflush(struct mount *mp, struct vnode *skipvp, int flags)
 1418 {
 1419         struct lwp *l = curlwp;         /* XXX */
 1420         struct vnode *vp, *nvp;
 1421         int busy = 0;
 1422 
 1423         simple_lock(&mntvnode_slock);
 1424 loop:
 1425         /*
  1426          * NOTE: we do not use TAILQ_FOREACH here, since vgone() and
  1427          * vclean() are called within the loop and may unlink vp.
 1428          */
 1429         for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
 1430                 if (vp->v_mount != mp)
 1431                         goto loop;
 1432                 nvp = TAILQ_NEXT(vp, v_mntvnodes);
 1433                 /*
 1434                  * Skip over a selected vnode.
 1435                  */
 1436                 if (vp == skipvp)
 1437                         continue;
 1438                 simple_lock(&vp->v_interlock);
 1439                 /*
 1440          * Skip over vnodes marked VSYSTEM.
 1441                  */
 1442                 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
 1443                         simple_unlock(&vp->v_interlock);
 1444                         continue;
 1445                 }
 1446                 /*
 1447                  * If WRITECLOSE is set, only flush out regular file
 1448                  * vnodes open for writing.
 1449                  */
 1450                 if ((flags & WRITECLOSE) &&
 1451                     (vp->v_writecount == 0 || vp->v_type != VREG)) {
 1452                         simple_unlock(&vp->v_interlock);
 1453                         continue;
 1454                 }
 1455                 /*
 1456                  * With v_usecount == 0, all we need to do is clear
 1457                  * out the vnode data structures and we are done.
 1458                  */
 1459                 if (vp->v_usecount == 0) {
 1460                         simple_unlock(&mntvnode_slock);
 1461                         vgonel(vp, l);
 1462                         simple_lock(&mntvnode_slock);
 1463                         continue;
 1464                 }
 1465                 /*
 1466                  * If FORCECLOSE is set, forcibly close the vnode.
 1467                  * For block or character devices, revert to an
 1468                  * anonymous device. For all other files, just kill them.
 1469                  */
 1470                 if (flags & FORCECLOSE) {
 1471                         simple_unlock(&mntvnode_slock);
 1472                         if (vp->v_type != VBLK && vp->v_type != VCHR) {
 1473                                 vgonel(vp, l);
 1474                         } else {
 1475                                 vclean(vp, 0, l);
 1476                                 vp->v_op = spec_vnodeop_p;
 1477                                 insmntque(vp, (struct mount *)0);
 1478                         }
 1479                         simple_lock(&mntvnode_slock);
 1480                         continue;
 1481                 }
 1482 #ifdef DEBUG
 1483                 if (busyprt)
 1484                         vprint("vflush: busy vnode", vp);
 1485 #endif
 1486                 simple_unlock(&vp->v_interlock);
 1487                 busy++;
 1488         }
 1489         simple_unlock(&mntvnode_slock);
 1490         if (busy)
 1491                 return (EBUSY);
 1492         return (0);
 1493 }
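
/*
 * Editor's sketch, not part of the original source: typical use of
 * vflush() from a file system's unmount entry point, modelled on the
 * ffs unmount path; "examplefs_unmount" is hypothetical.
 */
#if 0
static int
examplefs_unmount(struct mount *mp, int mntflags, struct lwp *l)
{
        int flags = 0, error;

        if (mntflags & MNT_FORCE)
                flags |= FORCECLOSE;
        /* Flush every vnode on mp; fails with EBUSY if any stay active. */
        if ((error = vflush(mp, NULLVP, flags)) != 0)
                return (error);
        /* ... free fs-private data and close the backing device ... */
        return (0);
}
#endif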
 1494 
 1495 /*
 1496  * Disassociate the underlying file system from a vnode.
 1497  */
 1498 static void
 1499 vclean(struct vnode *vp, int flags, struct lwp *l)
 1500 {
 1501         struct mount *mp;
 1502         int active;
 1503 
 1504         LOCK_ASSERT(simple_lock_held(&vp->v_interlock));
 1505 
 1506         /*
 1507          * Check to see if the vnode is in use.
 1508          * If so, we have to reference it before we clean it out
 1509          * so that its count cannot fall to zero and generate a
 1510          * race against ourselves to recycle it.
 1511          */
 1512 
 1513         if ((active = vp->v_usecount) != 0) {
 1514                 vp->v_usecount++;
 1515 #ifdef DIAGNOSTIC
 1516                 if (vp->v_usecount == 0) {
 1517                         vprint("vclean", vp);
 1518                         panic("vclean: usecount overflow");
 1519                 }
 1520 #endif
 1521         }
 1522 
 1523         /*
 1524          * Prevent the vnode from being recycled or
 1525          * brought into use while we clean it out.
 1526          */
 1527         if (vp->v_flag & VXLOCK)
 1528                 panic("vclean: deadlock, vp %p", vp);
 1529         vp->v_flag |= VXLOCK;
 1530         if (vp->v_flag & VEXECMAP) {
 1531                 uvmexp.execpages -= vp->v_uobj.uo_npages;
 1532                 uvmexp.filepages += vp->v_uobj.uo_npages;
 1533         }
 1534         vp->v_flag &= ~(VTEXT|VEXECMAP);
 1535 
 1536         /*
 1537          * Even if the count is zero, the VOP_INACTIVE routine may still
 1538          * have the object locked while it cleans it out.  For
 1539          * active vnodes, it ensures that no other activity can
 1540          * occur while the underlying object is being cleaned out.
 1541          *
 1542          * We drain the lock to make sure we are the last one trying to
 1543          * get it and immediately resurrect the lock.  Future accesses
 1544          * for locking this _vnode_ will be protected by VXLOCK.  However,
 1545          * upper layers might be using the _lock_ in case the file system
 1546          * exported it and might access it while the vnode lingers in
 1547          * deadfs.
 1548          */
 1549         VOP_LOCK(vp, LK_DRAIN | LK_RESURRECT | LK_INTERLOCK);
 1550 
 1551         /*
 1552          * Clean out any cached data associated with the vnode.
 1553          * If it is a special device, remove it from the special
 1554          * device alias list if it is on one.
 1555          */
 1556         if (flags & DOCLOSE) {
 1557                 int error;
 1558                 struct vnode *vq, *vx;
 1559 
 1560                 vn_start_write(vp, &mp, V_WAIT | V_LOWER);
 1561                 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
 1562                 vn_finished_write(mp, V_LOWER);
 1563                 if (error)
 1564                         error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
 1565                 KASSERT(error == 0);
 1566                 KASSERT((vp->v_flag & VONWORKLST) == 0);
 1567 
 1568                 if (active)
 1569                         VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL);
 1570 
 1571                 if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
 1572                     vp->v_specinfo != 0) {
 1573                         simple_lock(&spechash_slock);
 1574                         if (vp->v_hashchain != NULL) {
 1575                                 if (*vp->v_hashchain == vp) {
 1576                                         *vp->v_hashchain = vp->v_specnext;
 1577                                 } else {
 1578                                         for (vq = *vp->v_hashchain; vq;
 1579                                              vq = vq->v_specnext) {
 1580                                                 if (vq->v_specnext != vp)
 1581                                                         continue;
 1582                                                 vq->v_specnext = vp->v_specnext;
 1583                                                 break;
 1584                                         }
 1585                                         if (vq == NULL)
 1586                                                 panic("missing bdev");
 1587                                 }
 1588                                 if (vp->v_flag & VALIASED) {
 1589                                         vx = NULL;
 1590                         for (vq = *vp->v_hashchain; vq;
 1591                              vq = vq->v_specnext) {
 1592                                                 if (vq->v_rdev != vp->v_rdev ||
 1593                                                     vq->v_type != vp->v_type)
 1594                                                         continue;
 1595                                                 if (vx)
 1596                                                         break;
 1597                                                 vx = vq;
 1598                                         }
 1599                                         if (vx == NULL)
 1600                                                 panic("missing alias");
 1601                                         if (vq == NULL)
 1602                                                 vx->v_flag &= ~VALIASED;
 1603                                         vp->v_flag &= ~VALIASED;
 1604                                 }
 1605                         }
 1606                         simple_unlock(&spechash_slock);
 1607                         FREE(vp->v_specinfo, M_VNODE);
 1608                         vp->v_specinfo = NULL;
 1609                 }
 1610         }
 1611         LOCK_ASSERT(!simple_lock_held(&vp->v_interlock));
 1612 
 1613         /*
 1614          * If purging an active vnode, it must be closed and
 1615          * deactivated before being reclaimed. Note that the
 1616          * VOP_INACTIVE will unlock the vnode.
 1617          */
 1618         if (active) {
 1619                 VOP_INACTIVE(vp, l);
 1620         } else {
 1621                 /*
 1622                  * Any other processes trying to obtain this lock must first
 1623                  * wait for VXLOCK to clear, then call the new lock operation.
 1624                  */
 1625                 VOP_UNLOCK(vp, 0);
 1626         }
 1627         /*
 1628          * Reclaim the vnode.
 1629          */
 1630         if (VOP_RECLAIM(vp, l))
 1631                 panic("vclean: cannot reclaim, vp %p", vp);
 1632         if (active) {
 1633                 /*
 1634                  * Inline copy of vrele() since VOP_INACTIVE
 1635                  * has already been called.
 1636                  */
 1637                 simple_lock(&vp->v_interlock);
 1638                 if (--vp->v_usecount <= 0) {
 1639 #ifdef DIAGNOSTIC
 1640                         if (vp->v_usecount < 0 || vp->v_writecount != 0) {
 1641                                 vprint("vclean: bad ref count", vp);
 1642                                 panic("vclean: ref cnt");
 1643                         }
 1644 #endif
 1645                         /*
 1646                          * Insert at tail of LRU list.
 1647                          */
 1648 
 1649                         simple_unlock(&vp->v_interlock);
 1650                         simple_lock(&vnode_free_list_slock);
 1651 #ifdef DIAGNOSTIC
 1652                         if (vp->v_holdcnt > 0)
 1653                                 panic("vclean: not clean, vp %p", vp);
 1654 #endif
 1655                         TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 1656                         simple_unlock(&vnode_free_list_slock);
 1657                 } else
 1658                         simple_unlock(&vp->v_interlock);
 1659         }
 1660 
 1661         KASSERT(vp->v_uobj.uo_npages == 0);
 1662         if (vp->v_type == VREG && vp->v_ractx != NULL) {
 1663                 uvm_ra_freectx(vp->v_ractx);
 1664                 vp->v_ractx = NULL;
 1665         }
 1666         cache_purge(vp);
 1667 
 1668         /*
 1669          * Done with purge, notify sleepers of the grim news.
 1670          */
 1671         vp->v_op = dead_vnodeop_p;
 1672         vp->v_tag = VT_NON;
 1673         vp->v_vnlock = NULL;
 1674         simple_lock(&vp->v_interlock);
 1675         VN_KNOTE(vp, NOTE_REVOKE);      /* FreeBSD has this in vn_pollgone() */
 1676         vp->v_flag &= ~(VXLOCK|VLOCKSWORK);
 1677         if (vp->v_flag & VXWANT) {
 1678                 vp->v_flag &= ~VXWANT;
 1679                 simple_unlock(&vp->v_interlock);
 1680                 wakeup((caddr_t)vp);
 1681         } else
 1682                 simple_unlock(&vp->v_interlock);
 1683 }
 1684 
 1685 /*
 1686  * Recycle an unused vnode to the front of the free list.
 1687  * Release the passed interlock if the vnode will be recycled.
 1688  */
 1689 int
 1690 vrecycle(struct vnode *vp, struct simplelock *inter_lkp, struct lwp *l)
 1691 {
 1692 
 1693         simple_lock(&vp->v_interlock);
 1694         if (vp->v_usecount == 0) {
 1695                 if (inter_lkp)
 1696                         simple_unlock(inter_lkp);
 1697                 vgonel(vp, l);
 1698                 return (1);
 1699         }
 1700         simple_unlock(&vp->v_interlock);
 1701         return (0);
 1702 }
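
/*
 * Editor's sketch, not part of the original source: the usual caller of
 * vrecycle() is a file system's inactive VOP, which recycles the vnode
 * at once when the backing object no longer exists, in the style of the
 * ufs code; "ip" is the fs-private inode.
 */
#if 0
        VOP_UNLOCK(vp, 0);
        /* If the inode has been freed, reclaim the vnode immediately. */
        if (ip->i_mode == 0)
                vrecycle(vp, NULL, l);
        return (0);
#endif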
 1703 
 1704 /*
 1705  * Eliminate all activity associated with a vnode
 1706  * in preparation for reuse.
 1707  */
 1708 void
 1709 vgone(struct vnode *vp)
 1710 {
 1711         struct lwp *l = curlwp;         /* XXX */
 1712 
 1713         simple_lock(&vp->v_interlock);
 1714         vgonel(vp, l);
 1715 }
 1716 
 1717 /*
 1718  * vgone, with the vp interlock held.
 1719  */
 1720 void
 1721 vgonel(struct vnode *vp, struct lwp *l)
 1722 {
 1723 
 1724         LOCK_ASSERT(simple_lock_held(&vp->v_interlock));
 1725 
 1726         /*
 1727          * If a vgone (or vclean) is already in progress,
 1728          * wait until it is done and return.
 1729          */
 1730 
 1731         if (vp->v_flag & VXLOCK) {
 1732                 vp->v_flag |= VXWANT;
 1733                 ltsleep(vp, PINOD | PNORELOCK, "vgone", 0, &vp->v_interlock);
 1734                 return;
 1735         }
 1736 
 1737         /*
 1738          * Clean out the filesystem specific data.
 1739          */
 1740 
 1741         vclean(vp, DOCLOSE, l);
 1742         KASSERT((vp->v_flag & VONWORKLST) == 0);
 1743 
 1744         /*
 1745          * Delete from old mount point vnode list, if on one.
 1746          */
 1747 
 1748         if (vp->v_mount != NULL)
 1749                 insmntque(vp, (struct mount *)0);
 1750 
 1751         /*
 1752          * The test of the back pointer and the reference count of
 1753          * zero is needed because getcleanvnode removes the vnode from
 1754          * the free list before its reference count is incremented;
 1755          * the count is bumped only after vgone is called. If the reference
 1756          * count were incremented first, vgone would (incorrectly)
 1757          * try to close the previous instance of the underlying object.
 1758          * So, the back pointer is explicitly set to `0xdeadb' in
 1759          * getnewvnode after removing it from the freelist to ensure
 1760          * that we do not try to move it here.
 1761          */
 1762 
 1763         vp->v_type = VBAD;
 1764         if (vp->v_usecount == 0) {
 1765                 boolean_t dofree;
 1766 
 1767                 simple_lock(&vnode_free_list_slock);
 1768                 if (vp->v_holdcnt > 0)
 1769                         panic("vgonel: not clean, vp %p", vp);
 1770                 /*
 1771                  * If it isn't on the freelist, we were called by getcleanvnode
 1772                  * and the vnode is being re-used.  Otherwise, we free it.
 1773                  */
 1774                 dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb;
 1775                 if (dofree) {
 1776                         TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 1777                         numvnodes--;
 1778                 }
 1779                 simple_unlock(&vnode_free_list_slock);
 1780                 if (dofree)
 1781                         pool_put(&vnode_pool, vp);
 1782         }
 1783 }
 1784 
 1785 /*
 1786  * Lookup a vnode by device number.
 1787  */
 1788 int
 1789 vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
 1790 {
 1791         struct vnode *vp;
 1792         int rc = 0;
 1793 
 1794         simple_lock(&spechash_slock);
 1795         for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
 1796                 if (dev != vp->v_rdev || type != vp->v_type)
 1797                         continue;
 1798                 *vpp = vp;
 1799                 rc = 1;
 1800                 break;
 1801         }
 1802         simple_unlock(&spechash_slock);
 1803         return (rc);
 1804 }
 1805 
 1806 /*
 1807  * Revoke all the vnodes corresponding to the specified minor number
 1808  * range (endpoints inclusive) of the specified major.
 1809  */
 1810 void
 1811 vdevgone(int maj, int minl, int minh, enum vtype type)
 1812 {
 1813         struct vnode *vp;
 1814         int mn;
 1815 
 1816         vp = NULL;      /* XXX gcc */
 1817 
 1818         for (mn = minl; mn <= minh; mn++)
 1819                 if (vfinddev(makedev(maj, mn), type, &vp))
 1820                         VOP_REVOKE(vp, REVOKEALL);
 1821 }
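
/*
 * Editor's sketch, not part of the original source: a disk driver's
 * detach routine revokes all vnodes for its minor numbers with one
 * vdevgone() call per device class.  The devsw lookups are standard
 * NetBSD interfaces; "exdisk_bdevsw"/"exdisk_cdevsw" and "self" are
 * assumptions for illustration.
 */
#if 0
        int bmaj = bdevsw_lookup_major(&exdisk_bdevsw);
        int cmaj = cdevsw_lookup_major(&exdisk_cdevsw);
        int mn = DISKMINOR(device_unit(self), 0);

        vdevgone(bmaj, mn, mn + MAXPARTITIONS - 1, VBLK);
        vdevgone(cmaj, mn, mn + MAXPARTITIONS - 1, VCHR);
#endif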
 1822 
 1823 /*
 1824  * Calculate the total number of references to a special device.
 1825  */
 1826 int
 1827 vcount(struct vnode *vp)
 1828 {
 1829         struct vnode *vq, *vnext;
 1830         int count;
 1831 
 1832 loop:
 1833         if ((vp->v_flag & VALIASED) == 0)
 1834                 return (vp->v_usecount);
 1835         simple_lock(&spechash_slock);
 1836         for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
 1837                 vnext = vq->v_specnext;
 1838                 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
 1839                         continue;
 1840                 /*
 1841                  * Alias, but not in use, so flush it out.
 1842                  */
 1843                 if (vq->v_usecount == 0 && vq != vp &&
 1844                     (vq->v_flag & VXLOCK) == 0) {
 1845                         simple_unlock(&spechash_slock);
 1846                         vgone(vq);
 1847                         goto loop;
 1848                 }
 1849                 count += vq->v_usecount;
 1850         }
 1851         simple_unlock(&spechash_slock);
 1852         return (count);
 1853 }
 1854 
 1855 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
 1856 #define ARRAY_PRINT(idx, arr) \
 1857     ((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN")
 1858 
 1859 const char * const vnode_tags[] = { VNODE_TAGS };
 1860 const char * const vnode_types[] = { VNODE_TYPES };
 1861 const char vnode_flagbits[] = VNODE_FLAGBITS;
 1862 
 1863 /*
 1864  * Print out a description of a vnode.
 1865  */
 1866 void
 1867 vprint(const char *label, struct vnode *vp)
 1868 {
 1869         char bf[96];
 1870 
 1871         if (label != NULL)
 1872                 printf("%s: ", label);
 1873         printf("tag %s(%d) type %s(%d), usecount %d, writecount %ld, "
 1874             "refcount %ld,", ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
 1875             ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
 1876             vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
 1877         bitmask_snprintf(vp->v_flag, vnode_flagbits, bf, sizeof(bf));
 1878         if (bf[0] != '\0')
 1879                 printf(" flags (%s)", &bf[1]);
 1880         if (vp->v_data == NULL) {
 1881                 printf("\n");
 1882         } else {
 1883                 printf("\n\t");
 1884                 VOP_PRINT(vp);
 1885         }
 1886 }
 1887 
 1888 #ifdef DEBUG
 1889 /*
 1890  * List all of the locked vnodes in the system.
 1891  * Called when debugging the kernel.
 1892  */
 1893 void
 1894 printlockedvnodes(void)
 1895 {
 1896         struct mount *mp, *nmp;
 1897         struct vnode *vp;
 1898 
 1899         printf("Locked vnodes\n");
 1900         simple_lock(&mountlist_slock);
 1901         for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
 1902              mp = nmp) {
 1903                 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
 1904                         nmp = CIRCLEQ_NEXT(mp, mnt_list);
 1905                         continue;
 1906                 }
 1907                 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
 1908                         if (VOP_ISLOCKED(vp))
 1909                                 vprint(NULL, vp);
 1910                 }
 1911                 simple_lock(&mountlist_slock);
 1912                 nmp = CIRCLEQ_NEXT(mp, mnt_list);
 1913                 vfs_unbusy(mp);
 1914         }
 1915         simple_unlock(&mountlist_slock);
 1916 }
 1917 #endif
 1918 
 1919 /*
 1920  * sysctl helper routine to return list of supported fstypes
 1921  */
 1922 static int
 1923 sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
 1924 {
 1925         char bf[MFSNAMELEN];
 1926         char *where = oldp;
 1927         struct vfsops *v;
 1928         size_t needed, left, slen;
 1929         int error, first;
 1930 
 1931         if (newp != NULL)
 1932                 return (EPERM);
 1933         if (namelen != 0)
 1934                 return (EINVAL);
 1935 
 1936         first = 1;
 1937         error = 0;
 1938         needed = 0;
 1939         left = *oldlenp;
 1940 
 1941         LIST_FOREACH(v, &vfs_list, vfs_list) {
 1942                 if (where == NULL)
 1943                         needed += strlen(v->vfs_name) + 1;
 1944                 else {
 1945                         memset(bf, 0, sizeof(bf));
 1946                         if (first) {
 1947                                 strncpy(bf, v->vfs_name, sizeof(bf));
 1948                                 first = 0;
 1949                         } else {
 1950                                 bf[0] = ' ';
 1951                                 strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
 1952                         }
 1953                         bf[sizeof(bf)-1] = '\0';
 1954                         slen = strlen(bf);
 1955                         if (left < slen + 1)
 1956                                 break;
 1957                         /* +1 to copy out the trailing NUL byte */
 1958                         error = copyout(bf, where, slen + 1);
 1959                         if (error)
 1960                                 break;
 1961                         where += slen;
 1962                         needed += slen;
 1963                         left -= slen;
 1964                 }
 1965         }
 1966         *oldlenp = needed;
 1967         return (error);
 1968 }
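
/*
 * Editor's note: the helper above follows the usual two-pass sysctl
 * protocol -- when oldp is NULL it only reports the size needed,
 * otherwise it copies the list out.  A userland fragment reading the
 * node with the matching probe-then-fetch pattern might look like this
 * (sketch, not part of the original source):
 */
#if 0
        char *buf;
        size_t len;

        if (sysctlbyname("vfs.generic.fstypes", NULL, &len, NULL, 0) == -1)
                err(1, "sysctlbyname");
        if ((buf = malloc(len)) == NULL)
                err(1, "malloc");
        if (sysctlbyname("vfs.generic.fstypes", buf, &len, NULL, 0) == -1)
                err(1, "sysctlbyname");
        printf("%s\n", buf);    /* space-separated list, e.g. "ffs nfs ..." */
#endif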
 1969 
 1970 /*
 1971  * Top level filesystem related information gathering.
 1972  */
 1973 SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
 1974 {
 1975         sysctl_createv(clog, 0, NULL, NULL,
 1976                        CTLFLAG_PERMANENT,
 1977                        CTLTYPE_NODE, "vfs", NULL,
 1978                        NULL, 0, NULL, 0,
 1979                        CTL_VFS, CTL_EOL);
 1980         sysctl_createv(clog, 0, NULL, NULL,
 1981                        CTLFLAG_PERMANENT,
 1982                        CTLTYPE_NODE, "generic",
 1983                        SYSCTL_DESCR("Non-specific vfs related information"),
 1984                        NULL, 0, NULL, 0,
 1985                        CTL_VFS, VFS_GENERIC, CTL_EOL);
 1986         sysctl_createv(clog, 0, NULL, NULL,
 1987                        CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
 1988                        CTLTYPE_INT, "usermount",
 1989                        SYSCTL_DESCR("Whether unprivileged users may mount "
 1990                                     "filesystems"),
 1991                        NULL, 0, &dovfsusermount, 0,
 1992                        CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
 1993         sysctl_createv(clog, 0, NULL, NULL,
 1994                        CTLFLAG_PERMANENT,
 1995                        CTLTYPE_STRING, "fstypes",
 1996                        SYSCTL_DESCR("List of file systems present"),
 1997                        sysctl_vfs_generic_fstypes, 0, NULL, 0,
 1998                        CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
 1999         sysctl_createv(clog, 0, NULL, NULL,
 2000                        CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
 2001                        CTLTYPE_INT, "magiclinks",
 2002                        SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
 2003                        NULL, 0, &vfs_magiclinks, 0,
 2004                        CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
 2005 }
 2006 
 2007 
 2008 int kinfo_vdebug = 1;
 2009 int kinfo_vgetfailed;
 2010 #define KINFO_VNODESLOP 10
 2011 /*
 2012  * Dump vnode list (via sysctl).
 2013  * Copyout address of vnode followed by vnode.
 2014  */
 2015 /* ARGSUSED */
 2016 int
 2017 sysctl_kern_vnode(SYSCTLFN_ARGS)
 2018 {
 2019         char *where = oldp;
 2020         size_t *sizep = oldlenp;
 2021         struct mount *mp, *nmp;
 2022         struct vnode *vp;
 2023         char *bp = where, *savebp;
 2024         char *ewhere;
 2025         int error;
 2026 
 2027         if (namelen != 0)
 2028                 return (EOPNOTSUPP);
 2029         if (newp != NULL)
 2030                 return (EPERM);
 2031 
 2032 #define VPTRSZ  sizeof(struct vnode *)
 2033 #define VNODESZ sizeof(struct vnode)
 2034         if (where == NULL) {
 2035                 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
 2036                 return (0);
 2037         }
 2038         ewhere = where + *sizep;
 2039 
 2040         simple_lock(&mountlist_slock);
 2041         for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
 2042              mp = nmp) {
 2043                 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
 2044                         nmp = CIRCLEQ_NEXT(mp, mnt_list);
 2045                         continue;
 2046                 }
 2047                 savebp = bp;
 2048 again:
 2049                 simple_lock(&mntvnode_slock);
 2050                 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
 2051                         /*
 2052                          * Check that the vp is still associated with
 2053                          * this filesystem.  RACE: could have been
 2054                          * recycled onto the same filesystem.
 2055                          */
 2056                         if (vp->v_mount != mp) {
 2057                                 simple_unlock(&mntvnode_slock);
 2058                                 if (kinfo_vdebug)
 2059                                         printf("kinfo: vp changed\n");
 2060                                 bp = savebp;
 2061                                 goto again;
 2062                         }
 2063                         if (bp + VPTRSZ + VNODESZ > ewhere) {
 2064                                 simple_unlock(&mntvnode_slock);
 2065                                 *sizep = bp - where;
 2066                                 return (ENOMEM);
 2067                         }
 2068                         simple_unlock(&mntvnode_slock);
 2069                         if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
 2070                            (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
 2071                                 return (error);
 2072                         bp += VPTRSZ + VNODESZ;
 2073                         simple_lock(&mntvnode_slock);
 2074                 }
 2075                 simple_unlock(&mntvnode_slock);
 2076                 simple_lock(&mountlist_slock);
 2077                 nmp = CIRCLEQ_NEXT(mp, mnt_list);
 2078                 vfs_unbusy(mp);
 2079         }
 2080         simple_unlock(&mountlist_slock);
 2081 
 2082         *sizep = bp - where;
 2083         return (0);
 2084 }
 2085 
 2086 /*
 2087  * Check to see if a filesystem is mounted on a block device.
 2088  */
 2089 int
 2090 vfs_mountedon(struct vnode *vp)
 2091 {
 2092         struct vnode *vq;
 2093         int error = 0;
 2094 
 2095         if (vp->v_type != VBLK)
 2096                 return ENOTBLK;
 2097         if (vp->v_specmountpoint != NULL)
 2098                 return (EBUSY);
 2099         if (vp->v_flag & VALIASED) {
 2100                 simple_lock(&spechash_slock);
 2101                 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
 2102                         if (vq->v_rdev != vp->v_rdev ||
 2103                             vq->v_type != vp->v_type)
 2104                                 continue;
 2105                         if (vq->v_specmountpoint != NULL) {
 2106                                 error = EBUSY;
 2107                                 break;
 2108                         }
 2109                 }
 2110                 simple_unlock(&spechash_slock);
 2111         }
 2112         return (error);
 2113 }
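
/*
 * Editor's sketch, not part of the original source: mount code vets the
 * backing device vnode with vfs_mountedon() and vcount() (defined
 * earlier) before claiming it, in the style of the ffs mount path;
 * "devvp" is the block-device vnode being mounted.
 */
#if 0
        if ((error = vfs_mountedon(devvp)) != 0)
                return (error);         /* a file system already lives here */
        if (vcount(devvp) > 1 && devvp != rootvp)
                return (EBUSY);         /* device is open elsewhere */
#endif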
 2114 
 2115 /*
 2116  * Do the usual access checking.
 2117  * file_mode, uid, and gid are from the vnode in question,
 2118  * while acc_mode and cred are from the VOP_ACCESS parameter list.
 2119  */
 2120 int
 2121 vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid,
 2122     mode_t acc_mode, kauth_cred_t cred)
 2123 {
 2124         mode_t mask;
 2125         int error, ismember;
 2126 
 2127         /*
 2128          * Super-user always gets read/write access, but execute access depends
 2129          * on at least one execute bit being set.
 2130          */
 2131         if (kauth_cred_geteuid(cred) == 0) {
 2132                 if ((acc_mode & VEXEC) && type != VDIR &&
 2133                     (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
 2134                         return (EACCES);
 2135                 return (0);
 2136         }
 2137 
 2138         mask = 0;
 2139 
 2140         /* Otherwise, check the owner. */
 2141         if (kauth_cred_geteuid(cred) == uid) {
 2142                 if (acc_mode & VEXEC)
 2143                         mask |= S_IXUSR;
 2144                 if (acc_mode & VREAD)
 2145                         mask |= S_IRUSR;
 2146                 if (acc_mode & VWRITE)
 2147                         mask |= S_IWUSR;
 2148                 return ((file_mode & mask) == mask ? 0 : EACCES);
 2149         }
 2150 
 2151         /* Otherwise, check the groups. */
 2152         error = kauth_cred_ismember_gid(cred, gid, &ismember);
 2153         if (error)
 2154                 return (error);
 2155         if (kauth_cred_getegid(cred) == gid || ismember) {
 2156                 if (acc_mode & VEXEC)
 2157                         mask |= S_IXGRP;
 2158                 if (acc_mode & VREAD)
 2159                         mask |= S_IRGRP;
 2160                 if (acc_mode & VWRITE)
 2161                         mask |= S_IWGRP;
 2162                 return ((file_mode & mask) == mask ? 0 : EACCES);
 2163         }
 2164 
 2165         /* Otherwise, check everyone else. */
 2166         if (acc_mode & VEXEC)
 2167                 mask |= S_IXOTH;
 2168         if (acc_mode & VREAD)
 2169                 mask |= S_IROTH;
 2170         if (acc_mode & VWRITE)
 2171                 mask |= S_IWOTH;
 2172         return ((file_mode & mask) == mask ? 0 : EACCES);
 2173 }
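
/*
 * Editor's sketch, not part of the original source: a file system's
 * access VOP typically ends by delegating to vaccess() with the mode,
 * owner and group from its private node, in the style of ufs_access();
 * "exnode" and VTOEX() are hypothetical.
 */
#if 0
        struct exnode *np = VTOEX(ap->a_vp);

        return (vaccess(ap->a_vp->v_type, np->ex_mode & ALLPERMS,
            np->ex_uid, np->ex_gid, ap->a_mode, ap->a_cred));
#endif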
 2174 
 2175 /*
 2176  * Unmount all file systems.
 2177  * We traverse the list in reverse order under the assumption that doing so
 2178  * will avoid needing to worry about dependencies.
 2179  */
 2180 void
 2181 vfs_unmountall(struct lwp *l)
 2182 {
 2183         struct mount *mp, *nmp;
 2184         int allerror, error;
 2185 
 2186         printf("unmounting file systems...");
 2187         for (allerror = 0,
 2188              mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
 2189                 nmp = mp->mnt_list.cqe_prev;
 2190 #ifdef DEBUG
 2191                 printf("\nunmounting %s (%s)...",
 2192                     mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
 2193 #endif
 2194                 /*
 2195                  * XXX Freeze syncer.  Must do this before locking the
 2196                  * mount point.  See dounmount() for details.
 2197                  */
 2198                 lockmgr(&syncer_lock, LK_EXCLUSIVE, NULL);
 2199                 if (vfs_busy(mp, 0, 0)) {
 2200                         lockmgr(&syncer_lock, LK_RELEASE, NULL);
 2201                         continue;
 2202                 }
 2203                 if ((error = dounmount(mp, MNT_FORCE, l)) != 0) {
 2204                         printf("unmount of %s failed with error %d\n",
 2205                             mp->mnt_stat.f_mntonname, error);
 2206                         allerror = 1;
 2207                 }
 2208         }
 2209         printf(" done\n");
 2210         if (allerror)
 2211                 printf("WARNING: some file systems would not unmount\n");
 2212 }
 2213 
 2214 extern struct simplelock bqueue_slock; /* XXX */
 2215 
 2216 /*
 2217  * Sync and unmount file systems before shutting down.
 2218  */
 2219 void
 2220 vfs_shutdown(void)
 2221 {
 2222         struct lwp *l;
 2223 
 2224         /* XXX we're certainly not running in lwp0's context! */
 2225         l = curlwp;
 2226         if (l == NULL)
 2227                 l = &lwp0;
 2228 
 2229         printf("syncing disks... ");
 2230 
 2231         /* remove user process from run queue */
 2232         suspendsched();
 2233         (void) spl0();
 2234 
 2235         /* avoid coming back this way again if we panic. */
 2236         doing_shutdown = 1;
 2237 
 2238         sys_sync(l, NULL, NULL);
 2239 
 2240         /* Wait for sync to finish. */
 2241         if (buf_syncwait() != 0) {
 2242 #if defined(DDB) && defined(DEBUG_HALT_BUSY)
 2243                 Debugger();
 2244 #endif
 2245                 printf("giving up\n");
 2246                 return;
 2247         } else
 2248                 printf("done\n");
 2249 
 2250         /*
 2251          * If we've panic'd, don't make the situation potentially
 2252          * worse by unmounting the file systems.
 2253          */
 2254         if (panicstr != NULL)
 2255                 return;
 2256 
 2257         /* Release inodes held by texts before update. */
 2258 #ifdef notdef
 2259         vnshutdown();
 2260 #endif
 2261         /* Unmount file systems. */
 2262         vfs_unmountall(l);
 2263 }
 2264 
 2265 /*
 2266  * Mount the root file system.  If the operator didn't specify a
 2267  * file system to use, try all possible file systems until one
 2268  * succeeds.
 2269  */
 2270 int
 2271 vfs_mountroot(void)
 2272 {
 2273         struct vfsops *v;
 2274         int error = ENODEV;
 2275 
 2276         if (root_device == NULL)
 2277                 panic("vfs_mountroot: root device unknown");
 2278 
 2279         switch (device_class(root_device)) {
 2280         case DV_IFNET:
 2281                 if (rootdev != NODEV)
 2282                         panic("vfs_mountroot: rootdev set for DV_IFNET "
 2283                             "(0x%08x -> %d,%d)", rootdev,
 2284                             major(rootdev), minor(rootdev));
 2285                 break;
 2286 
 2287         case DV_DISK:
 2288                 if (rootdev == NODEV)
 2289                         panic("vfs_mountroot: rootdev not set for DV_DISK");
 2290                 if (bdevvp(rootdev, &rootvp))
 2291                         panic("vfs_mountroot: can't get vnode for rootdev");
 2292                 error = VOP_OPEN(rootvp, FREAD, FSCRED, curlwp);
 2293                 if (error) {
 2294                         printf("vfs_mountroot: can't open root device\n");
 2295                         return (error);
 2296                 }
 2297                 break;
 2298 
 2299         default:
 2300                 printf("%s: inappropriate for root file system\n",
 2301                     root_device->dv_xname);
 2302                 return (ENODEV);
 2303         }
 2304 
 2305         /*
 2306          * If user specified a file system, use it.
 2307          */
 2308         if (mountroot != NULL) {
 2309                 error = (*mountroot)();
 2310                 goto done;
 2311         }
 2312 
 2313         /*
 2314          * Try each file system currently configured into the kernel.
 2315          */
 2316         LIST_FOREACH(v, &vfs_list, vfs_list) {
 2317                 if (v->vfs_mountroot == NULL)
 2318                         continue;
 2319 #ifdef DEBUG
 2320                 aprint_normal("mountroot: trying %s...\n", v->vfs_name);
 2321 #endif
 2322                 error = (*v->vfs_mountroot)();
 2323                 if (!error) {
 2324                         aprint_normal("root file system type: %s\n",
 2325                             v->vfs_name);
 2326                         break;
 2327                 }
 2328         }
 2329 
 2330         if (v == NULL) {
 2331                 printf("no file system for %s", root_device->dv_xname);
 2332                 if (device_class(root_device) == DV_DISK)
 2333                         printf(" (dev 0x%x)", rootdev);
 2334                 printf("\n");
 2335                 error = EFTYPE;
 2336         }
 2337 
 2338 done:
 2339         if (error && device_class(root_device) == DV_DISK) {
 2340                 VOP_CLOSE(rootvp, FREAD, FSCRED, curlwp);
 2341                 vrele(rootvp);
 2342         }
 2343         return (error);
 2344 }
 2345 
 2346 /*
 2347  * Given a file system name, look up the vfsops for that
 2348  * file system, or return NULL if file system isn't present
 2349  * in the kernel.
 2350  */
 2351 struct vfsops *
 2352 vfs_getopsbyname(const char *name)
 2353 {
 2354         struct vfsops *v;
 2355 
 2356         LIST_FOREACH(v, &vfs_list, vfs_list) {
 2357                 if (strcmp(v->vfs_name, name) == 0)
 2358                         break;
 2359         }
 2360 
 2361         return (v);
 2362 }
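
/*
 * Editor's sketch, not part of the original source: mount(2) handling
 * resolves the file-system name supplied by userland through
 * vfs_getopsbyname() before calling into the file system.
 */
#if 0
        struct vfsops *vfsops;

        if ((vfsops = vfs_getopsbyname(fstypename)) == NULL)
                return (ENODEV);        /* not configured into this kernel */
#endif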
 2363 
 2364 /*
 2365  * Establish a file system and initialize it.
 2366  */
 2367 int
 2368 vfs_attach(struct vfsops *vfs)
 2369 {
 2370         struct vfsops *v;
 2371         int error = 0;
 2372 
 2373 
 2374         /*
 2375          * Make sure this file system doesn't already exist.
 2376          */
 2377         LIST_FOREACH(v, &vfs_list, vfs_list) {
 2378                 if (strcmp(vfs->vfs_name, v->vfs_name) == 0) {
 2379                         error = EEXIST;
 2380                         goto out;
 2381                 }
 2382         }
 2383 
 2384         /*
 2385          * Initialize the vnode operations for this file system.
 2386          */
 2387         vfs_opv_init(vfs->vfs_opv_descs);
 2388 
 2389         /*
 2390          * Now initialize the file system itself.
 2391          */
 2392         (*vfs->vfs_init)();
 2393 
 2394         /*
 2395          * ...and link it into the kernel's list.
 2396          */
 2397         LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list);
 2398 
 2399         /*
 2400          * Sanity: make sure the reference count is 0.
 2401          */
 2402         vfs->vfs_refcount = 0;
 2403 
 2404  out:
 2405         return (error);
 2406 }
 2407 
 2408 /*
 2409  * Remove a file system from the kernel.
 2410  */
 2411 int
 2412 vfs_detach(struct vfsops *vfs)
 2413 {
 2414         struct vfsops *v;
 2415 
 2416         /*
 2417          * Make sure no one is using the filesystem.
 2418          */
 2419         if (vfs->vfs_refcount != 0)
 2420                 return (EBUSY);
 2421 
 2422         /*
 2423          * ...and remove it from the kernel's list.
 2424          */
 2425         LIST_FOREACH(v, &vfs_list, vfs_list) {
 2426                 if (v == vfs) {
 2427                         LIST_REMOVE(v, vfs_list);
 2428                         break;
 2429                 }
 2430         }
 2431 
 2432         if (v == NULL)
 2433                 return (ESRCH);
 2434 
 2435         /*
 2436          * Now run the file system-specific cleanups.
 2437          */
 2438         (*vfs->vfs_done)();
 2439 
 2440         /*
 2441          * Free the vnode operations vector.
 2442          */
 2443         vfs_opv_free(vfs->vfs_opv_descs);
 2444         return (0);
 2445 }
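
/*
 * Editor's sketch, not part of the original source: vfs_attach() and
 * vfs_detach() as they might be driven from a loadable file system's
 * load/unload hooks; "examplefs_vfsops" and the hook names are
 * assumptions.
 */
#if 0
extern struct vfsops examplefs_vfsops;

static int
examplefs_load(void)
{
        /* Registers the vnode ops, runs vfs_init, links into vfs_list. */
        return (vfs_attach(&examplefs_vfsops));
}

static int
examplefs_unload(void)
{
        /* Fails with EBUSY while any examplefs instance is mounted. */
        return (vfs_detach(&examplefs_vfsops));
}
#endif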
 2446 
 2447 void
 2448 vfs_reinit(void)
 2449 {
 2450         struct vfsops *vfs;
 2451 
 2452         LIST_FOREACH(vfs, &vfs_list, vfs_list) {
 2453                 if (vfs->vfs_reinit) {
 2454                         (*vfs->vfs_reinit)();
 2455                 }
 2456         }
 2457 }
 2458 
 2459 /*
 2460  * Request a filesystem to suspend write operations.
 2461  */
 2462 int
 2463 vfs_write_suspend(struct mount *mp, int slpflag, int slptimeo)
 2464 {
 2465         struct lwp *l = curlwp; /* XXX */
 2466         int error;
 2467 
 2468         while ((mp->mnt_iflag & IMNT_SUSPEND)) {
 2469                 if (slptimeo < 0)
 2470                         return EWOULDBLOCK;
 2471                 error = tsleep(&mp->mnt_flag, slpflag, "suspwt1", slptimeo);
 2472                 if (error)
 2473                         return error;
 2474         }
 2475         mp->mnt_iflag |= IMNT_SUSPEND;
 2476 
 2477         simple_lock(&mp->mnt_slock);
 2478         if (mp->mnt_writeopcountupper > 0)
 2479                 ltsleep(&mp->mnt_writeopcountupper, PUSER - 1, "suspwt",
 2480                         0, &mp->mnt_slock);
 2481         simple_unlock(&mp->mnt_slock);
 2482 
 2483         error = VFS_SYNC(mp, MNT_WAIT, l->l_cred, l);
 2484         if (error) {
 2485                 vfs_write_resume(mp);
 2486                 return error;
 2487         }
 2488         mp->mnt_iflag |= IMNT_SUSPENDLOW;
 2489 
 2490         simple_lock(&mp->mnt_slock);
 2491         if (mp->mnt_writeopcountlower > 0)
 2492                 ltsleep(&mp->mnt_writeopcountlower, PUSER - 1, "suspwt",
 2493                         0, &mp->mnt_slock);
 2494         mp->mnt_iflag |= IMNT_SUSPENDED;
 2495         simple_unlock(&mp->mnt_slock);
 2496 
 2497         return 0;
 2498 }
 2499 
 2500 /*
 2501  * Request a filesystem to resume write operations.
 2502  */
 2503 void
 2504 vfs_write_resume(struct mount *mp)
 2505 {
 2506 
 2507         if ((mp->mnt_iflag & IMNT_SUSPEND) == 0)
 2508                 return;
 2509         mp->mnt_iflag &= ~(IMNT_SUSPEND | IMNT_SUSPENDLOW | IMNT_SUSPENDED);
 2510         wakeup(&mp->mnt_flag);
 2511 }
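
/*
 * Editor's sketch, not part of the original source: the suspend/resume
 * pair above brackets work that needs a quiescent, fully synced file
 * system, e.g. taking a snapshot; error handling is abbreviated.
 */
#if 0
        int error;

        if ((error = vfs_write_suspend(mp, 0, 0)) != 0)
                return (error);
        /* ... all writers are drained and the fs is synced; snapshot ... */
        vfs_write_resume(mp);
#endif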
 2512 
 2513 void
 2514 copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
 2515 {
 2516         const struct statvfs *mbp;
 2517 
 2518         if (sbp == (mbp = &mp->mnt_stat))
 2519                 return;
 2520 
 2521         (void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
 2522         sbp->f_fsid = mbp->f_fsid;
 2523         sbp->f_owner = mbp->f_owner;
 2524         sbp->f_flag = mbp->f_flag;
 2525         sbp->f_syncwrites = mbp->f_syncwrites;
 2526         sbp->f_asyncwrites = mbp->f_asyncwrites;
 2527         sbp->f_syncreads = mbp->f_syncreads;
 2528         sbp->f_asyncreads = mbp->f_asyncreads;
 2529         (void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
 2530         (void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
 2531             sizeof(sbp->f_fstypename));
 2532         (void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
 2533             sizeof(sbp->f_mntonname));
 2534         (void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname,
 2535             sizeof(sbp->f_mntfromname));
 2536         sbp->f_namemax = mbp->f_namemax;
 2537 }
 2538 
 2539 int
 2540 set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
 2541     struct mount *mp, struct lwp *l)
 2542 {
 2543         int error;
 2544         size_t size;
 2545         struct statvfs *sfs = &mp->mnt_stat;
 2546         int (*fun)(const void *, void *, size_t, size_t *);
 2547 
 2548         (void)strncpy(mp->mnt_stat.f_fstypename, mp->mnt_op->vfs_name,
 2549             sizeof(mp->mnt_stat.f_fstypename));
 2550 
 2551         if (onp) {
 2552                 struct cwdinfo *cwdi = l->l_proc->p_cwdi;
 2553                 fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
 2554                 if (cwdi->cwdi_rdir != NULL) {
 2555                         size_t len;
 2556                         char *bp;
 2557                         char *path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 2558 
 2559                         if (!path) /* XXX can't happen with M_WAITOK */
 2560                                 return ENOMEM;
 2561 
 2562                         bp = path + MAXPATHLEN;
 2563                         *--bp = '\0';
 2564                         error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
 2565                             path, MAXPATHLEN / 2, 0, l);
 2566                         if (error) {
 2567                                 free(path, M_TEMP);
 2568                                 return error;
 2569                         }
 2570 
 2571                         len = strlen(bp);
 2572                         if (len > sizeof(sfs->f_mntonname) - 1)
 2573                                 len = sizeof(sfs->f_mntonname) - 1;
 2574                         (void)strncpy(sfs->f_mntonname, bp, len);
 2575                         free(path, M_TEMP);
 2576 
 2577                         if (len < sizeof(sfs->f_mntonname) - 1) {
 2578                                 error = (*fun)(onp, &sfs->f_mntonname[len],
 2579                                     sizeof(sfs->f_mntonname) - len - 1, &size);
 2580                                 if (error)
 2581                                         return error;
 2582                                 size += len;
 2583                         } else {
 2584                                 size = len;
 2585                         }
 2586                 } else {
 2587                         error = (*fun)(onp, &sfs->f_mntonname,
 2588                             sizeof(sfs->f_mntonname) - 1, &size);
 2589                         if (error)
 2590                                 return error;
 2591                 }
 2592                 (void)memset(sfs->f_mntonname + size, 0,
 2593                     sizeof(sfs->f_mntonname) - size);
 2594         }
 2595 
 2596         if (fromp) {
 2597                 fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
 2598                 error = (*fun)(fromp, sfs->f_mntfromname,
 2599                     sizeof(sfs->f_mntfromname) - 1, &size);
 2600                 if (error)
 2601                         return error;
 2602                 (void)memset(sfs->f_mntfromname + size, 0,
 2603                     sizeof(sfs->f_mntfromname) - size);
 2604         }
 2605         return 0;
 2606 }
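
/*
 * Editor's sketch, not part of the original source: the tail of a
 * typical VFS_STATVFS implementation, which fills the capacity figures
 * from its private superblock and lets copy_statvfs_info() supply the
 * generic fields cached in mp->mnt_stat; "fs" is hypothetical
 * fs-private state.
 */
#if 0
        sbp->f_bsize = fs->fs_bsize;
        sbp->f_frsize = fs->fs_fsize;
        sbp->f_blocks = fs->fs_dsize;
        /* ... f_bfree, f_bavail, f_files, f_ffree, ... */
        copy_statvfs_info(sbp, mp);
        return (0);
#endif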
 2607 
 2608 void
 2609 vfs_timestamp(struct timespec *ts)
 2610 {
 2611 
 2612         nanotime(ts);
 2613 }
 2614 
 2615 /*
 2616  * mount_specific_key_create --
 2617  *      Create a key for subsystem mount-specific data.
 2618  */
 2619 int
 2620 mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
 2621 {
 2622 
 2623         return (specificdata_key_create(mount_specificdata_domain, keyp, dtor));
 2624 }
 2625 
 2626 /*
 2627  * mount_specific_key_delete --
 2628  *      Delete a key for subsystem mount-specific data.
 2629  */
 2630 void
 2631 mount_specific_key_delete(specificdata_key_t key)
 2632 {
 2633 
 2634         specificdata_key_delete(mount_specificdata_domain, key);
 2635 }
 2636 
 2637 /*
 2638  * mount_initspecific --
 2639  *      Initialize a mount's specificdata container.
 2640  */
 2641 void
 2642 mount_initspecific(struct mount *mp)
 2643 {
 2644         int error;
 2645 
 2646         error = specificdata_init(mount_specificdata_domain,
 2647                                   &mp->mnt_specdataref);
 2648         KASSERT(error == 0);
 2649 }
 2650 
 2651 /*
 2652  * mount_finispecific --
 2653  *      Finalize a mount's specificdata container.
 2654  */
 2655 void
 2656 mount_finispecific(struct mount *mp)
 2657 {
 2658 
 2659         specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
 2660 }
 2661 
 2662 /*
 2663  * mount_getspecific --
 2664  *      Return mount-specific data corresponding to the specified key.
 2665  */
 2666 void *
 2667 mount_getspecific(struct mount *mp, specificdata_key_t key)
 2668 {
 2669 
 2670         return (specificdata_getspecific(mount_specificdata_domain,
 2671                                          &mp->mnt_specdataref, key));
 2672 }
 2673 
 2674 /*
 2675  * mount_setspecific --
 2676  *      Set mount-specific data corresponding to the specified key.
 2677  */
 2678 void
 2679 mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
 2680 {
 2681 
 2682         specificdata_setspecific(mount_specificdata_domain,
 2683                                  &mp->mnt_specdataref, key, data);
 2684 }
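
/*
 * Editor's sketch, not part of the original source: a subsystem keeps
 * private per-mount state through the specificdata hooks above -- create
 * a key once at initialization, then get/set against any mount.  The
 * key, destructor and callers here are assumptions.
 */
#if 0
static specificdata_key_t ex_mount_key;

static void
ex_mount_dtor(void *data)
{
        free(data, M_TEMP);     /* run automatically at mount teardown */
}

void
ex_subsys_init(void)
{
        if (mount_specific_key_create(&ex_mount_key, ex_mount_dtor) != 0)
                panic("ex_subsys_init: cannot create mount key");
}

void
ex_subsys_setdata(struct mount *mp, void *data)
{
        mount_setspecific(mp, ex_mount_key, data);
}
#endif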
 2685 
 2686 #ifdef DDB
 2687 static const char buf_flagbits[] = BUF_FLAGBITS;
 2688 
 2689 void
 2690 vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...))
 2691 {
 2692         char bf[1024];
 2693 
 2694         (*pr)("  vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%"
 2695             PRIx64 " dev 0x%x\n",
 2696             bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev);
 2697 
 2698         bitmask_snprintf(bp->b_flags, buf_flagbits, bf, sizeof(bf));
 2699         (*pr)("  error %d flags 0x%s\n", bp->b_error, bf);
 2700 
 2701         (*pr)("  bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
 2702                   bp->b_bufsize, bp->b_bcount, bp->b_resid);
 2703         (*pr)("  data %p saveaddr %p dep %p\n",
 2704                   bp->b_data, bp->b_saveaddr, LIST_FIRST(&bp->b_dep));
 2705         (*pr)("  iodone %p\n", bp->b_iodone);
 2706 }
 2707 
 2708 
 2709 void
 2710 vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...))
 2711 {
 2712         char bf[256];
 2713 
 2714         uvm_object_printit(&vp->v_uobj, full, pr);
 2715         bitmask_snprintf(vp->v_flag, vnode_flagbits, bf, sizeof(bf));
 2716         (*pr)("\nVNODE flags %s\n", bf);
 2717         (*pr)("mp %p numoutput %d size 0x%llx\n",
 2718               vp->v_mount, vp->v_numoutput, vp->v_size);
 2719 
 2720         (*pr)("data %p usecount %d writecount %ld holdcnt %ld numoutput %d\n",
 2721               vp->v_data, vp->v_usecount, vp->v_writecount,
 2722               vp->v_holdcnt, vp->v_numoutput);
 2723 
 2724         (*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n",
 2725               ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
 2726               ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
 2727               vp->v_mount, vp->v_mountedhere);
 2728 
 2729         if (full) {
 2730                 struct buf *bp;
 2731 
 2732                 (*pr)("clean bufs:\n");
 2733                 LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
 2734                         (*pr)(" bp %p\n", bp);
 2735                         vfs_buf_print(bp, full, pr);
 2736                 }
 2737 
 2738                 (*pr)("dirty bufs:\n");
 2739                 LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
 2740                         (*pr)(" bp %p\n", bp);
 2741                         vfs_buf_print(bp, full, pr);
 2742                 }
 2743         }
 2744 }
 2745 
 2746 void
 2747 vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...))
 2748 {
 2749         char sbuf[256];
 2750 
 2751         (*pr)("vnodecovered = %p syncer = %p data = %p\n",
 2752                         mp->mnt_vnodecovered,mp->mnt_syncer,mp->mnt_data);
 2753 
 2754         (*pr)("fs_bshift %d dev_bshift = %d\n",
 2755                         mp->mnt_fs_bshift,mp->mnt_dev_bshift);
 2756 
 2757         bitmask_snprintf(mp->mnt_flag, __MNT_FLAG_BITS, sbuf, sizeof(sbuf));
 2758         (*pr)("flag = %s\n", sbuf);
 2759 
 2760         bitmask_snprintf(mp->mnt_iflag, __IMNT_FLAG_BITS, sbuf, sizeof(sbuf));
 2761         (*pr)("iflag = %s\n", sbuf);
 2762 
 2763         /* XXX use lockmgr_printinfo */
 2764         if (mp->mnt_lock.lk_sharecount)
 2765                 (*pr)(" lock type %s: SHARED (count %d)", mp->mnt_lock.lk_wmesg,
 2766                     mp->mnt_lock.lk_sharecount);
 2767         else if (mp->mnt_lock.lk_flags & LK_HAVE_EXCL) {
 2768                 (*pr)(" lock type %s: EXCL (count %d) by ",
 2769                     mp->mnt_lock.lk_wmesg, mp->mnt_lock.lk_exclusivecount);
 2770                 if (mp->mnt_lock.lk_flags & LK_SPIN)
 2771                         (*pr)("processor %lu", mp->mnt_lock.lk_cpu);
 2772                 else
 2773                         (*pr)("pid %d.%d", mp->mnt_lock.lk_lockholder,
 2774                             mp->mnt_lock.lk_locklwp);
 2775         } else
 2776                 (*pr)(" not locked");
 2777         if ((mp->mnt_lock.lk_flags & LK_SPIN) == 0 && mp->mnt_lock.lk_waitcount > 0)
 2778                 (*pr)(" with %d pending", mp->mnt_lock.lk_waitcount);
 2779 
 2780         (*pr)("\n");
 2781 
 2782         if (mp->mnt_unmounter) {
 2783                 (*pr)("unmounter pid = %d ", mp->mnt_unmounter->l_proc->p_pid);
 2784         }
 2785         (*pr)("wcnt = %d, writeopcountupper = %d, writeopcountlower = %d\n",
 2786                 mp->mnt_wcnt, mp->mnt_writeopcountupper, mp->mnt_writeopcountlower);
 2787 
 2788         (*pr)("statvfs cache:\n");
 2789         (*pr)("\tbsize = %lu\n",mp->mnt_stat.f_bsize);
 2790         (*pr)("\tfrsize = %lu\n",mp->mnt_stat.f_frsize);
 2791         (*pr)("\tiosize = %lu\n",mp->mnt_stat.f_iosize);
 2792 
 2793         (*pr)("\tblocks = %"PRIu64"\n",mp->mnt_stat.f_blocks);
 2794         (*pr)("\tbfree = %"PRIu64"\n",mp->mnt_stat.f_bfree);
 2795         (*pr)("\tbavail = %"PRIu64"\n",mp->mnt_stat.f_bavail);
 2796         (*pr)("\tbresvd = %"PRIu64"\n",mp->mnt_stat.f_bresvd);
 2797 
 2798         (*pr)("\tfiles = %"PRIu64"\n",mp->mnt_stat.f_files);
 2799         (*pr)("\tffree = %"PRIu64"\n",mp->mnt_stat.f_ffree);
 2800         (*pr)("\tfavail = %"PRIu64"\n",mp->mnt_stat.f_favail);
 2801         (*pr)("\tfresvd = %"PRIu64"\n",mp->mnt_stat.f_fresvd);
 2802 
 2803         (*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
 2804                         mp->mnt_stat.f_fsidx.__fsid_val[0],
 2805                         mp->mnt_stat.f_fsidx.__fsid_val[1]);
 2806 
 2807         (*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner);
 2808         (*pr)("\tnamemax = %lu\n",mp->mnt_stat.f_namemax);
 2809 
 2810         bitmask_snprintf(mp->mnt_stat.f_flag, __MNT_FLAG_BITS, sbuf,
 2811             sizeof(sbuf));
 2812         (*pr)("\tflag = %s\n",sbuf);
 2813         (*pr)("\tsyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_syncwrites);
 2814         (*pr)("\tasyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_asyncwrites);
 2815         (*pr)("\tsyncreads = %" PRIu64 "\n",mp->mnt_stat.f_syncreads);
 2816         (*pr)("\tasyncreads = %" PRIu64 "\n",mp->mnt_stat.f_asyncreads);
 2817         (*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename);
 2818         (*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname);
 2819         (*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname);
 2820 
 2821         {
 2822                 int cnt = 0;
 2823                 struct vnode *vp;
 2824                 (*pr)("locked vnodes =");
 2825                 /* XXX would take mountlist lock, except ddb may not have context */
 2826                 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
 2827                         if (VOP_ISLOCKED(vp)) {
 2828                                 if ((++cnt % 6) == 0) {
 2829                                         (*pr)(" %p,\n\t", vp);
 2830                                 } else {
 2831                                         (*pr)(" %p,", vp);
 2832                                 }
 2833                         }
 2834                 }
 2835                 (*pr)("\n");
 2836         }
 2837 
 2838         if (full) {
 2839                 int cnt = 0;
 2840                 struct vnode *vp;
 2841                 (*pr)("all vnodes =");
 2842                 /* XXX would take mountlist lock, except ddb may not have context */
 2843                 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
 2844                         if (!TAILQ_NEXT(vp, v_mntvnodes)) {
 2845                                 (*pr)(" %p", vp);
 2846                         } else if ((++cnt % 6) == 0) {
 2847                                 (*pr)(" %p,\n\t", vp);
 2848                         } else {
 2849                                 (*pr)(" %p,", vp);
 2850                         }
 2851                 }
 2852                 (*pr)("\n");
 2853         }
 2854 }
 2855 #endif /* DDB */
