FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_subr.c


    1 /*      $OpenBSD: vfs_subr.c,v 1.318 2022/12/26 19:25:49 miod Exp $     */
    2 /*      $NetBSD: vfs_subr.c,v 1.53 1996/04/22 01:39:13 christos Exp $   */
    3 
    4 /*
    5  * Copyright (c) 1989, 1993
    6  *      The Regents of the University of California.  All rights reserved.
    7  * (c) UNIX System Laboratories, Inc.
    8  * All or some portions of this file are derived from material licensed
    9  * to the University of California by American Telephone and Telegraph
   10  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   11  * the permission of UNIX System Laboratories, Inc.
   12  *
   13  * Redistribution and use in source and binary forms, with or without
   14  * modification, are permitted provided that the following conditions
   15  * are met:
   16  * 1. Redistributions of source code must retain the above copyright
   17  *    notice, this list of conditions and the following disclaimer.
   18  * 2. Redistributions in binary form must reproduce the above copyright
   19  *    notice, this list of conditions and the following disclaimer in the
   20  *    documentation and/or other materials provided with the distribution.
   21  * 3. Neither the name of the University nor the names of its contributors
   22  *    may be used to endorse or promote products derived from this software
   23  *    without specific prior written permission.
   24  *
   25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   35  * SUCH DAMAGE.
   36  *
   37  *      @(#)vfs_subr.c  8.13 (Berkeley) 4/18/94
   38  */
   39 
   40 /*
   41  * External virtual filesystem routines
   42  */
   43 
   44 #include <sys/param.h>
   45 #include <sys/systm.h>
   46 #include <sys/proc.h>
   47 #include <sys/sysctl.h>
   48 #include <sys/mount.h>
   49 #include <sys/fcntl.h>
   50 #include <sys/conf.h>
   51 #include <sys/vnode.h>
   52 #include <sys/lock.h>
   53 #include <sys/lockf.h>
   54 #include <sys/stat.h>
   55 #include <sys/acct.h>
   56 #include <sys/namei.h>
   57 #include <sys/ucred.h>
   58 #include <sys/buf.h>
   59 #include <sys/errno.h>
   60 #include <sys/malloc.h>
   61 #include <sys/mbuf.h>
   62 #include <sys/syscallargs.h>
   63 #include <sys/pool.h>
   64 #include <sys/tree.h>
   65 #include <sys/specdev.h>
   66 #include <sys/atomic.h>
   67 
   68 #include <netinet/in.h>
   69 
   70 #include <uvm/uvm_extern.h>
   71 #include <uvm/uvm_vnode.h>
   72 
   73 #include "softraid.h"
   74 
   75 void sr_quiesce(void);
   76 
   77 enum vtype iftovt_tab[16] = {
   78         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
   79         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
   80 };
   81 
   82 int     vttoif_tab[9] = {
   83         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
   84         S_IFSOCK, S_IFIFO, S_IFMT,
   85 };
   86 
   87 int prtactive = 0;              /* 1 => print out reclaim of active vnodes */
   88 int suid_clear = 1;             /* 1 => clear SUID / SGID on owner change */
   89 
   90 /*
   91  * Insq/Remq for the vnode usage lists.
   92  */
   93 #define bufinsvn(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_vnbufs)
   94 #define bufremvn(bp) {                                                  \
   95         LIST_REMOVE(bp, b_vnbufs);                                      \
   96         LIST_NEXT(bp, b_vnbufs) = NOLIST;                               \
   97 }
   98 
   99 TAILQ_HEAD(freelst, vnode);
  100 struct freelst vnode_hold_list; /* list of vnodes referencing buffers */
  101 struct freelst vnode_free_list; /* vnode free list */
  102 
  103 struct mntlist mountlist;       /* mounted filesystem list */
  104 
  105 void    vclean(struct vnode *, int, struct proc *);
  106 
  107 void insmntque(struct vnode *, struct mount *);
  108 int getdevvp(dev_t, struct vnode **, enum vtype);
  109 
  110 int vfs_hang_addrlist(struct mount *, struct netexport *,
  111                                   struct export_args *);
  112 int vfs_free_netcred(struct radix_node *, void *, u_int);
  113 void vfs_free_addrlist(struct netexport *);
  114 void vputonfreelist(struct vnode *);
  115 
  116 int vflush_vnode(struct vnode *, void *);
  117 int maxvnodes;
  118 
  119 struct mutex vnode_mtx = MUTEX_INITIALIZER(IPL_BIO);
  120 
  121 void vfs_unmountall(void);
  122 
  123 #ifdef DEBUG
  124 void printlockedvnodes(void);
  125 #endif
  126 
  127 struct pool vnode_pool;
  128 struct pool uvm_vnode_pool;
  129 
  130 static inline int rb_buf_compare(const struct buf *b1, const struct buf *b2);
  131 RBT_GENERATE(buf_rb_bufs, buf, b_rbbufs, rb_buf_compare);
  132 
  133 static inline int
  134 rb_buf_compare(const struct buf *b1, const struct buf *b2)
  135 {
   136         if (b1->b_lblkno < b2->b_lblkno)
   137                 return (-1);
   138         if (b1->b_lblkno > b2->b_lblkno)
   139                 return (1);
   140         return (0);
  141 }
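
/*
 * Example (sketch, not part of this file): the comparator above keys a
 * vnode's buffer tree by logical block number, so a lookup fills in
 * b_lblkno of a dummy buf and searches v_bufs_tree, much as incore()
 * in vfs_bio.c does (vp and lblkno assumed in scope):
 *
 *	struct buf key, *bp;
 *
 *	key.b_lblkno = lblkno;
 *	bp = RBT_FIND(buf_rb_bufs, &vp->v_bufs_tree, &key);
 */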
  142 
  143 /*
  144  * Initialize the vnode management data structures.
  145  */
  146 void
  147 vntblinit(void)
  148 {
  149         /* buffer cache may need a vnode for each buffer */
  150         maxvnodes = 2 * initialvnodes;
  151         pool_init(&vnode_pool, sizeof(struct vnode), 0, IPL_NONE,
  152             PR_WAITOK, "vnodes", NULL);
  153         pool_init(&uvm_vnode_pool, sizeof(struct uvm_vnode), 0, IPL_NONE,
  154             PR_WAITOK, "uvmvnodes", NULL);
  155         TAILQ_INIT(&vnode_hold_list);
  156         TAILQ_INIT(&vnode_free_list);
  157         TAILQ_INIT(&mountlist);
  158         /*
  159          * Initialize the filesystem syncer.
  160          */
  161         vn_initialize_syncerd();
  162 
  163 #ifdef NFSSERVER
  164         rn_init(sizeof(struct sockaddr_in));
  165 #endif /* NFSSERVER */
  166 }
  167 
  168 /*
  169  * Allocate a mount point.
  170  *
  171  * The returned mount point is marked as busy.
  172  */
  173 struct mount *
  174 vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp)
  175 {
  176         struct mount *mp;
  177 
  178         mp = malloc(sizeof(*mp), M_MOUNT, M_WAITOK|M_ZERO);
  179         rw_init_flags(&mp->mnt_lock, "vfslock", RWL_IS_VNODE);
  180         (void)vfs_busy(mp, VB_READ|VB_NOWAIT);
  181 
  182         TAILQ_INIT(&mp->mnt_vnodelist);
  183         mp->mnt_vnodecovered = vp;
  184 
  185         atomic_inc_int(&vfsp->vfc_refcount);
  186         mp->mnt_vfc = vfsp;
  187         mp->mnt_op = vfsp->vfc_vfsops;
  188         mp->mnt_flag = vfsp->vfc_flags;
  189         strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
  190 
  191         return (mp);
  192 }
  193 
  194 /*
  195  * Release a mount point.
  196  */
  197 void
  198 vfs_mount_free(struct mount *mp)
  199 {
  200         atomic_dec_int(&mp->mnt_vfc->vfc_refcount);
  201         free(mp, M_MOUNT, sizeof(*mp));
  202 }
  203 
  204 /*
  205  * Mark a mount point as busy. Used to synchronize access and to delay
  206  * unmounting.
  207  *
   208  * The default behaviour is to attempt getting a READ lock; if an
   209  * unmount is in progress, wait for it to finish and then return failure.
  210  */
  211 int
  212 vfs_busy(struct mount *mp, int flags)
  213 {
  214         int rwflags = 0;
  215 
  216         if (flags & VB_WRITE)
  217                 rwflags |= RW_WRITE;
  218         else
  219                 rwflags |= RW_READ;
  220 
  221         if (flags & VB_WAIT)
  222                 rwflags |= RW_SLEEPFAIL;
  223         else
  224                 rwflags |= RW_NOSLEEP;
  225 
  226 #ifdef WITNESS
  227         if (flags & VB_DUPOK)
  228                 rwflags |= RW_DUPOK;
  229 #endif
  230 
  231         if (rw_enter(&mp->mnt_lock, rwflags))
  232                 return (EBUSY);
  233 
  234         return (0);
  235 }
  236 
  237 /*
  238  * Free a busy file system
  239  */
  240 void
  241 vfs_unbusy(struct mount *mp)
  242 {
  243         rw_exit(&mp->mnt_lock);
  244 }
  245 
  246 int
  247 vfs_isbusy(struct mount *mp)
  248 {
  249         if (RWLOCK_OWNER(&mp->mnt_lock) > 0)
  250                 return (1);
  251         else
  252                 return (0);
  253 }
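
/*
 * Example (sketch): a typical transient reader of a mount point, such
 * as printlockedvnodes() below, brackets its work like this, giving up
 * if an unmount is in progress:
 *
 *	if (vfs_busy(mp, VB_READ|VB_NOWAIT))
 *		return;
 *	... examine mp ...
 *	vfs_unbusy(mp);
 */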
  254 
  255 /*
  256  * Lookup a filesystem type, and if found allocate and initialize
  257  * a mount structure for it.
  258  *
  259  * Devname is usually updated by mount(8) after booting.
  260  */
  261 int
  262 vfs_rootmountalloc(char *fstypename, char *devname, struct mount **mpp)
  263 {
  264         struct vfsconf *vfsp;
  265         struct mount *mp;
  266 
  267         vfsp = vfs_byname(fstypename);
  268         if (vfsp == NULL)
  269                 return (ENODEV);
  270         mp = vfs_mount_alloc(NULLVP, vfsp);
  271         mp->mnt_flag |= MNT_RDONLY;
  272         mp->mnt_stat.f_mntonname[0] = '/';
  273         strlcpy(mp->mnt_stat.f_mntfromname, devname, MNAMELEN);
  274         strlcpy(mp->mnt_stat.f_mntfromspec, devname, MNAMELEN);
  275         *mpp = mp;
  276         return (0);
   277 }
  278 
  279 /*
  280  * Lookup a mount point by filesystem identifier.
  281  */
  282 struct mount *
  283 vfs_getvfs(fsid_t *fsid)
  284 {
  285         struct mount *mp;
  286 
  287         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
  288                 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
  289                     mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
  290                         return (mp);
  291                 }
  292         }
  293 
  294         return (NULL);
  295 }
  296 
  297 
  298 /*
  299  * Get a new unique fsid
  300  */
  301 void
  302 vfs_getnewfsid(struct mount *mp)
  303 {
  304         static u_short xxxfs_mntid;
  305 
  306         fsid_t tfsid;
  307         int mtype;
  308 
  309         mtype = mp->mnt_vfc->vfc_typenum;
  310         mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
  311         mp->mnt_stat.f_fsid.val[1] = mtype;
  312         if (xxxfs_mntid == 0)
  313                 ++xxxfs_mntid;
  314         tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
  315         tfsid.val[1] = mtype;
  316         if (!TAILQ_EMPTY(&mountlist)) {
  317                 while (vfs_getvfs(&tfsid)) {
  318                         tfsid.val[0]++;
  319                         xxxfs_mntid++;
  320                 }
  321         }
  322         mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
  323 }
  324 
  325 /*
  326  * Set vnode attributes to VNOVAL
  327  */
  328 void
  329 vattr_null(struct vattr *vap)
  330 {
  331 
  332         vap->va_type = VNON;
  333         /*
  334          * Don't get fancy: u_quad_t = u_int = VNOVAL leaves the u_quad_t
  335          * with 2^31-1 instead of 2^64-1.  Just write'm out and let
  336          * the compiler do its job.
  337          */
  338         vap->va_mode = VNOVAL;
  339         vap->va_nlink = VNOVAL;
  340         vap->va_uid = VNOVAL;
  341         vap->va_gid = VNOVAL;
  342         vap->va_fsid = VNOVAL;
  343         vap->va_fileid = VNOVAL;
  344         vap->va_size = VNOVAL;
  345         vap->va_blocksize = VNOVAL;
  346         vap->va_atime.tv_sec = VNOVAL;
  347         vap->va_atime.tv_nsec = VNOVAL;
  348         vap->va_mtime.tv_sec = VNOVAL;
  349         vap->va_mtime.tv_nsec = VNOVAL;
  350         vap->va_ctime.tv_sec = VNOVAL;
  351         vap->va_ctime.tv_nsec = VNOVAL;
  352         vap->va_gen = VNOVAL;
  353         vap->va_flags = VNOVAL;
  354         vap->va_rdev = VNOVAL;
  355         vap->va_bytes = VNOVAL;
  356         vap->va_filerev = VNOVAL;
  357         vap->va_vaflags = 0;
  358 }
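
/*
 * Example (sketch): callers initialize a vattr with vattr_null() and
 * set only the fields they want changed; fields left at VNOVAL are
 * ignored by the file system.  Truncating a file to zero length
 * (vp, cred and p assumed in scope):
 *
 *	struct vattr va;
 *
 *	vattr_null(&va);
 *	va.va_size = 0;
 *	error = VOP_SETATTR(vp, &va, cred, p);
 */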
  359 
  360 /*
  361  * Routines having to do with the management of the vnode table.
  362  */
  363 long numvnodes;
  364 
  365 /*
  366  * Return the next vnode from the free list.
  367  */
  368 int
  369 getnewvnode(enum vtagtype tag, struct mount *mp, const struct vops *vops,
  370     struct vnode **vpp)
  371 {
  372         struct proc *p = curproc;
  373         struct freelst *listhd;
  374         static int toggle;
  375         struct vnode *vp;
  376         int s;
  377 
  378         /*
  379          * allow maxvnodes to increase if the buffer cache itself
  380          * is big enough to justify it. (we don't shrink it ever)
  381          */
  382         maxvnodes = maxvnodes < bcstats.numbufs ? bcstats.numbufs
  383             : maxvnodes;
  384 
  385         /*
  386          * We must choose whether to allocate a new vnode or recycle an
  387          * existing one. The criterion for allocating a new one is that
  388          * the total number of vnodes is less than the number desired or
  389          * there are no vnodes on either free list. Generally we only
  390          * want to recycle vnodes that have no buffers associated with
  391          * them, so we look first on the vnode_free_list. If it is empty,
  392          * we next consider vnodes with referencing buffers on the
  393          * vnode_hold_list. The toggle ensures that half the time we
  394          * will use a buffer from the vnode_hold_list, and half the time
  395          * we will allocate a new one unless the list has grown to twice
   396          * the desired size. We are reluctant to recycle vnodes from the
  397          * vnode_hold_list because we will lose the identity of all its
  398          * referencing buffers.
  399          */
  400         toggle ^= 1;
  401         if (numvnodes / 2 > maxvnodes)
  402                 toggle = 0;
  403 
  404         s = splbio();
  405         if ((numvnodes < maxvnodes) ||
  406             ((TAILQ_FIRST(listhd = &vnode_free_list) == NULL) &&
  407             ((TAILQ_FIRST(listhd = &vnode_hold_list) == NULL) || toggle))) {
  408                 splx(s);
  409                 vp = pool_get(&vnode_pool, PR_WAITOK | PR_ZERO);
  410                 vp->v_uvm = pool_get(&uvm_vnode_pool, PR_WAITOK | PR_ZERO);
  411                 vp->v_uvm->u_vnode = vp;
  412                 uvm_obj_init(&vp->v_uvm->u_obj, &uvm_vnodeops, 0);
  413                 RBT_INIT(buf_rb_bufs, &vp->v_bufs_tree);
  414                 cache_tree_init(&vp->v_nc_tree);
  415                 TAILQ_INIT(&vp->v_cache_dst);
  416                 numvnodes++;
  417         } else {
  418                 TAILQ_FOREACH(vp, listhd, v_freelist) {
  419                         if (VOP_ISLOCKED(vp) == 0)
  420                                 break;
  421                 }
  422                 /*
  423                  * Unless this is a bad time of the month, at most
  424                  * the first NCPUS items on the free list are
  425                  * locked, so this is close enough to being empty.
  426                  */
  427                 if (vp == NULL) {
  428                         splx(s);
  429                         tablefull("vnode");
  430                         *vpp = NULL;
  431                         return (ENFILE);
  432                 }
  433 
  434 #ifdef DIAGNOSTIC
  435                 if (vp->v_usecount) {
  436                         vprint("free vnode", vp);
  437                         panic("free vnode isn't");
  438                 }
  439 #endif
  440 
  441                 TAILQ_REMOVE(listhd, vp, v_freelist);
  442                 vp->v_bioflag &= ~VBIOONFREELIST;
  443                 splx(s);
  444 
  445                 if (vp->v_type != VBAD)
  446                         vgonel(vp, p);
  447 #ifdef DIAGNOSTIC
  448                 if (vp->v_data) {
  449                         vprint("cleaned vnode", vp);
  450                         panic("cleaned vnode isn't");
  451                 }
  452                 s = splbio();
  453                 if (vp->v_numoutput)
  454                         panic("Clean vnode has pending I/O's");
  455                 splx(s);
  456 #endif
  457                 vp->v_flag = 0;
  458                 vp->v_socket = NULL;
  459         }
  460         cache_purge(vp);
  461         vp->v_type = VNON;
  462         vp->v_tag = tag;
  463         vp->v_op = vops;
  464         insmntque(vp, mp);
  465         *vpp = vp;
  466         vp->v_usecount = 1;
  467         vp->v_data = NULL;
  468         return (0);
  469 }
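
/*
 * Example (sketch): a file system's vget/inode-read path obtains its
 * vnode here and then wires up the private data; the names below are
 * illustrative, loosely after ffs_vget():
 *
 *	error = getnewvnode(VT_UFS, mp, &ffs_vops, &vp);
 *	if (error)
 *		return (error);
 *	vp->v_data = ip;
 *	ip->i_vnode = vp;
 */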
  470 
  471 /*
  472  * Move a vnode from one mount queue to another.
  473  */
  474 void
  475 insmntque(struct vnode *vp, struct mount *mp)
  476 {
  477         /*
  478          * Delete from old mount point vnode list, if on one.
  479          */
  480         if (vp->v_mount != NULL)
  481                 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
  482         /*
  483          * Insert into list of vnodes for the new mount point, if available.
  484          */
  485         if ((vp->v_mount = mp) != NULL)
  486                 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
  487 }
  488 
  489 /*
  490  * Create a vnode for a block device.
  491  * Used for root filesystem, argdev, and swap areas.
  492  * Also used for memory file system special devices.
  493  */
  494 int
  495 bdevvp(dev_t dev, struct vnode **vpp)
  496 {
  497         return (getdevvp(dev, vpp, VBLK));
  498 }
  499 
  500 /*
  501  * Create a vnode for a character device.
  502  * Used for console handling.
  503  */
  504 int
  505 cdevvp(dev_t dev, struct vnode **vpp)
  506 {
  507         return (getdevvp(dev, vpp, VCHR));
  508 }
  509 
  510 /*
  511  * Create a vnode for a device.
  512  * Used by bdevvp (block device) for root file system etc.,
  513  * and by cdevvp (character device) for console.
  514  */
  515 int
  516 getdevvp(dev_t dev, struct vnode **vpp, enum vtype type)
  517 {
  518         struct vnode *vp;
  519         struct vnode *nvp;
  520         int error;
  521 
  522         if (dev == NODEV) {
  523                 *vpp = NULLVP;
  524                 return (0);
  525         }
  526         error = getnewvnode(VT_NON, NULL, &spec_vops, &nvp);
  527         if (error) {
  528                 *vpp = NULLVP;
  529                 return (error);
  530         }
  531         vp = nvp;
  532         vp->v_type = type;
  533         if ((nvp = checkalias(vp, dev, NULL)) != NULL) {
  534                 vput(vp);
  535                 vp = nvp;
  536         }
  537         if (vp->v_type == VCHR && cdevsw[major(vp->v_rdev)].d_type == D_TTY)
  538                 vp->v_flag |= VISTTY;
  539         *vpp = vp;
  540         return (0);
  541 }
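
/*
 * Example (sketch): ffs_mountroot() sets up the root and swap device
 * vnodes with bdevvp() before mounting the root file system, roughly:
 *
 *	if (bdevvp(swapdev, &swapdev_vp) || bdevvp(rootdev, &rootvp))
 *		panic("ffs_mountroot: can't setup bdevvp's");
 */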
  542 
  543 /*
  544  * Check to see if the new vnode represents a special device
  545  * for which we already have a vnode (either because of
  546  * bdevvp() or because of a different vnode representing
  547  * the same block device). If such an alias exists, deallocate
  548  * the existing contents and return the aliased vnode. The
  549  * caller is responsible for filling it with its new contents.
  550  */
  551 struct vnode *
  552 checkalias(struct vnode *nvp, dev_t nvp_rdev, struct mount *mp)
  553 {
  554         struct proc *p = curproc;
  555         struct vnode *vp;
  556         struct vnodechain *vchain;
  557 
  558         if (nvp->v_type != VBLK && nvp->v_type != VCHR)
  559                 return (NULLVP);
  560 
  561         vchain = &speclisth[SPECHASH(nvp_rdev)];
  562 loop:
  563         SLIST_FOREACH(vp, vchain, v_specnext) {
  564                 if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) {
  565                         continue;
  566                 }
  567                 /*
  568                  * Alias, but not in use, so flush it out.
  569                  */
  570                 if (vp->v_usecount == 0) {
  571                         vgonel(vp, p);
  572                         goto loop;
  573                 }
  574                 if (vget(vp, LK_EXCLUSIVE)) {
  575                         goto loop;
  576                 }
  577                 break;
  578         }
  579 
  580         /*
   581          * The common case is actually handled inside the if statement below.
  582          */
  583         if (vp == NULL || !(vp->v_tag == VT_NON && vp->v_type == VBLK)) {
  584                 nvp->v_specinfo = malloc(sizeof(struct specinfo), M_VNODE,
  585                         M_WAITOK);
  586                 nvp->v_rdev = nvp_rdev;
  587                 nvp->v_hashchain = vchain;
  588                 nvp->v_specmountpoint = NULL;
  589                 nvp->v_speclockf = NULL;
  590                 nvp->v_specbitmap = NULL;
  591                 if (nvp->v_type == VCHR &&
  592                     (cdevsw[major(nvp_rdev)].d_flags & D_CLONE) &&
  593                     (minor(nvp_rdev) >> CLONE_SHIFT == 0)) {
  594                         if (vp != NULLVP)
  595                                 nvp->v_specbitmap = vp->v_specbitmap;
  596                         else
  597                                 nvp->v_specbitmap = malloc(CLONE_MAPSZ,
  598                                     M_VNODE, M_WAITOK | M_ZERO);
  599                 }
  600                 SLIST_INSERT_HEAD(vchain, nvp, v_specnext);
  601                 if (vp != NULLVP) {
  602                         nvp->v_flag |= VALIASED;
  603                         vp->v_flag |= VALIASED;
  604                         vput(vp);
  605                 }
  606                 return (NULLVP);
  607         }
  608 
  609         /*
  610          * This code is the uncommon case. It is called in case
  611          * we found an alias that was VT_NON && vtype of VBLK
  612          * This means we found a block device that was created
  613          * using bdevvp.
  614          * An example of such a vnode is the root partition device vnode
  615          * created in ffs_mountroot.
  616          *
  617          * The vnodes created by bdevvp should not be aliased (why?).
  618          */
  619 
  620         VOP_UNLOCK(vp);
  621         vclean(vp, 0, p);
  622         vp->v_op = nvp->v_op;
  623         vp->v_tag = nvp->v_tag;
  624         nvp->v_type = VNON;
  625         insmntque(vp, mp);
  626         return (vp);
  627 }
  628 
  629 /*
  630  * Grab a particular vnode from the free list, increment its
  631  * reference count and lock it. If the vnode lock bit is set,
  632  * the vnode is being eliminated in vgone. In that case, we
  633  * cannot grab it, so the process is awakened when the
  634  * transition is completed, and an error code is returned to
  635  * indicate that the vnode is no longer usable, possibly
  636  * having been changed to a new file system type.
  637  */
  638 int
  639 vget(struct vnode *vp, int flags)
  640 {
  641         int error, s, onfreelist;
  642 
  643         /*
  644          * If the vnode is in the process of being cleaned out for
  645          * another use, we wait for the cleaning to finish and then
  646          * return failure. Cleaning is determined by checking that
  647          * the VXLOCK flag is set.
  648          */
  649         mtx_enter(&vnode_mtx);
  650         if (vp->v_lflag & VXLOCK) {
  651                 if (flags & LK_NOWAIT) {
  652                         mtx_leave(&vnode_mtx);
  653                         return (EBUSY);
  654                 }
  655 
  656                 vp->v_lflag |= VXWANT;
  657                 msleep_nsec(vp, &vnode_mtx, PINOD, "vget", INFSLP);
  658                 mtx_leave(&vnode_mtx);
  659                 return (ENOENT);
  660         }
  661         mtx_leave(&vnode_mtx);
  662 
  663         s = splbio();
  664         onfreelist = vp->v_bioflag & VBIOONFREELIST;
  665         if (vp->v_usecount == 0 && onfreelist) {
  666                 if (vp->v_holdcnt > 0)
  667                         TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
  668                 else
  669                         TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
  670                 vp->v_bioflag &= ~VBIOONFREELIST;
  671         }
  672         splx(s);
  673 
  674         vp->v_usecount++;
  675         if (flags & LK_TYPE_MASK) {
  676                 if ((error = vn_lock(vp, flags)) != 0) {
  677                         vp->v_usecount--;
  678                         if (vp->v_usecount == 0 && onfreelist)
  679                                 vputonfreelist(vp);
  680                 }
  681                 return (error);
  682         }
  683 
  684         return (0);
  685 }
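
/*
 * Example (sketch): since vget() fails while a vnode is being cleaned
 * out, callers iterating over vnode lists usually skip or retry on
 * error:
 *
 *	if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT))
 *		continue;	(locked or being recycled; skip it)
 *	... use the now locked and referenced vp ...
 *	vput(vp);
 */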
  686 
  687 
  688 /* Vnode reference. */
  689 void
  690 vref(struct vnode *vp)
  691 {
  692         KERNEL_ASSERT_LOCKED();
  693 
  694 #ifdef DIAGNOSTIC
  695         if (vp->v_usecount == 0)
  696                 panic("vref used where vget required");
  697         if (vp->v_type == VNON)
  698                 panic("vref on a VNON vnode");
  699 #endif
  700         vp->v_usecount++;
  701 }
  702 
  703 void
  704 vputonfreelist(struct vnode *vp)
  705 {
  706         int s;
  707         struct freelst *lst;
  708 
  709         s = splbio();
  710 #ifdef DIAGNOSTIC
  711         if (vp->v_usecount != 0)
  712                 panic("Use count is not zero!");
  713 
  714         /*
  715          * If the hold count is still positive, one or many threads could still
  716          * be waiting on the vnode lock inside uvn_io().
  717          */
  718         if (vp->v_holdcnt == 0 && vp->v_lockcount != 0)
  719                 panic("%s: lock count is not zero", __func__);
  720 
  721         if (vp->v_bioflag & VBIOONFREELIST) {
  722                 vprint("vnode already on free list: ", vp);
  723                 panic("vnode already on free list");
  724         }
  725 #endif
  726 
  727         vp->v_bioflag |= VBIOONFREELIST;
  728         vp->v_bioflag &= ~VBIOERROR;
  729 
  730         if (vp->v_holdcnt > 0)
  731                 lst = &vnode_hold_list;
  732         else
  733                 lst = &vnode_free_list;
  734 
  735         if (vp->v_type == VBAD)
  736                 TAILQ_INSERT_HEAD(lst, vp, v_freelist);
  737         else
  738                 TAILQ_INSERT_TAIL(lst, vp, v_freelist);
  739 
  740         splx(s);
  741 }
  742 
  743 /*
  744  * vput(), just unlock and vrele()
  745  */
  746 void
  747 vput(struct vnode *vp)
  748 {
  749         struct proc *p = curproc;
  750         int s;
  751 
  752 #ifdef DIAGNOSTIC
  753         if (vp == NULL)
  754                 panic("vput: null vp");
  755 #endif
  756 
  757 #ifdef DIAGNOSTIC
  758         if (vp->v_usecount == 0) {
  759                 vprint("vput: bad ref count", vp);
  760                 panic("vput: ref cnt");
  761         }
  762 #endif
  763         vp->v_usecount--;
  764         KASSERT(vp->v_usecount > 0 || vp->v_uvcount == 0);
  765         if (vp->v_usecount > 0) {
  766                 VOP_UNLOCK(vp);
  767                 return;
  768         }
  769 
  770 #ifdef DIAGNOSTIC
  771         if (vp->v_writecount != 0) {
  772                 vprint("vput: bad writecount", vp);
  773                 panic("vput: v_writecount != 0");
  774         }
  775 #endif
  776 
  777         VOP_INACTIVE(vp, p);
  778 
  779         s = splbio();
  780         if (vp->v_usecount == 0 && !(vp->v_bioflag & VBIOONFREELIST))
  781                 vputonfreelist(vp);
  782         splx(s);
  783 }
  784 
  785 /*
  786  * Vnode release - use for active VNODES.
  787  * If count drops to zero, call inactive routine and return to freelist.
  788  * Returns 0 if it did not sleep.
  789  */
  790 int
  791 vrele(struct vnode *vp)
  792 {
  793         struct proc *p = curproc;
  794         int s;
  795 
  796 #ifdef DIAGNOSTIC
  797         if (vp == NULL)
  798                 panic("vrele: null vp");
  799 #endif
  800 #ifdef DIAGNOSTIC
  801         if (vp->v_usecount == 0) {
  802                 vprint("vrele: bad ref count", vp);
  803                 panic("vrele: ref cnt");
  804         }
  805 #endif
  806         vp->v_usecount--;
  807         if (vp->v_usecount > 0) {
  808                 return (0);
  809         }
  810 
  811 #ifdef DIAGNOSTIC
  812         if (vp->v_writecount != 0) {
  813                 vprint("vrele: bad writecount", vp);
  814                 panic("vrele: v_writecount != 0");
  815         }
  816 #endif
  817 
  818         if (vn_lock(vp, LK_EXCLUSIVE)) {
  819 #ifdef DIAGNOSTIC
  820                 vprint("vrele: cannot lock", vp);
  821 #endif
  822                 return (1);
  823         }
  824 
  825         VOP_INACTIVE(vp, p);
  826 
  827         s = splbio();
  828         if (vp->v_usecount == 0 && !(vp->v_bioflag & VBIOONFREELIST))
  829                 vputonfreelist(vp);
  830         splx(s);
  831         return (1);
  832 }
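
/*
 * Example (sketch): use vput() when the vnode is held locked, e.g.
 * after namei() with LOCKLEAF, and vrele() when only a reference is
 * held (nd, path and p assumed in scope):
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, p);
 *	if ((error = namei(&nd)) != 0)
 *		return (error);
 *	vp = nd.ni_vp;
 *	... vp is locked and referenced ...
 *	vput(vp);
 */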
  833 
  834 /* Page or buffer structure gets a reference. */
  835 void
  836 vhold(struct vnode *vp)
  837 {
  838         int s;
  839 
  840         s = splbio();
  841 
  842         /*
  843          * If it is on the freelist and the hold count is currently
  844          * zero, move it to the hold list.
  845          */
  846         if ((vp->v_bioflag & VBIOONFREELIST) &&
  847             vp->v_holdcnt == 0 && vp->v_usecount == 0) {
  848                 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
  849                 TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
  850         }
  851         vp->v_holdcnt++;
  852 
  853         splx(s);
  854 }
  855 
  856 /* Lose interest in a vnode. */
  857 void
  858 vdrop(struct vnode *vp)
  859 {
  860         int s;
  861 
  862         s = splbio();
  863 
  864 #ifdef DIAGNOSTIC
  865         if (vp->v_holdcnt == 0)
  866                 panic("vdrop: zero holdcnt");
  867 #endif
  868 
  869         vp->v_holdcnt--;
  870 
  871         /*
  872          * If it is on the holdlist and the hold count drops to
  873          * zero, move it to the free list.
  874          */
  875         if ((vp->v_bioflag & VBIOONFREELIST) &&
  876             vp->v_holdcnt == 0 && vp->v_usecount == 0) {
  877                 TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
  878                 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
  879         }
  880 
  881         splx(s);
  882 }
  883 
  884 /*
  885  * Remove any vnodes in the vnode table belonging to mount point mp.
  886  *
   887  * If MNT_NOFORCE is specified, there should not be any active vnodes;
   888  * return an error if any are found (nb: this is a user error, not a
  889  * system error). If MNT_FORCE is specified, detach any active vnodes
  890  * that are found.
  891  */
  892 #ifdef DEBUG_SYSCTL
  893 int busyprt = 0;        /* print out busy vnodes */
  894 struct ctldebug debug_vfs_busyprt = { "vfs_busyprt", &busyprt };
  895 #endif
  896 
  897 int
  898 vfs_mount_foreach_vnode(struct mount *mp,
  899     int (*func)(struct vnode *, void *), void *arg) {
  900         struct vnode *vp, *nvp;
  901         int error = 0;
  902 
  903 loop:
   904         TAILQ_FOREACH_SAFE(vp, &mp->mnt_vnodelist, v_mntvnodes, nvp) {
  905                 if (vp->v_mount != mp)
  906                         goto loop;
  907 
  908                 error = func(vp, arg);
  909 
  910                 if (error != 0)
  911                         break;
  912         }
  913 
  914         return (error);
  915 }
  916 
  917 struct vflush_args {
  918         struct vnode *skipvp;
  919         int busy;
  920         int flags;
  921 };
  922 
  923 int
  924 vflush_vnode(struct vnode *vp, void *arg)
  925 {
  926         struct vflush_args *va = arg;
  927         struct proc *p = curproc;
  928         int empty, s;
  929 
  930         if (vp == va->skipvp) {
  931                 return (0);
  932         }
  933 
  934         if ((va->flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
  935                 return (0);
  936         }
  937 
  938         /*
  939          * If WRITECLOSE is set, only flush out regular file
  940          * vnodes open for writing.
  941          */
  942         if ((va->flags & WRITECLOSE) &&
  943             (vp->v_writecount == 0 || vp->v_type != VREG)) {
  944                 return (0);
  945         }
  946 
  947         /*
  948          * With v_usecount == 0, all we need to do is clear
  949          * out the vnode data structures and we are done.
  950          */
  951         if (vp->v_usecount == 0) {
  952                 vgonel(vp, p);
  953                 return (0);
  954         }
  955 
  956         /*
  957          * If FORCECLOSE is set, forcibly close the vnode.
  958          * For block or character devices, revert to an
  959          * anonymous device. For all other files, just kill them.
  960          */
  961         if (va->flags & FORCECLOSE) {
  962                 if (vp->v_type != VBLK && vp->v_type != VCHR) {
  963                         vgonel(vp, p);
  964                 } else {
  965                         vclean(vp, 0, p);
  966                         vp->v_op = &spec_vops;
  967                         insmntque(vp, NULL);
  968                 }
  969                 return (0);
  970         }
  971 
  972         /*
   973          * If IGNORECLEAN is set, we may ignore vnodes which have
   974          * no changes pending to disk.
  975          * XXX Might be nice to check per-fs "inode" flags, but
  976          * generally the filesystem is sync'd already, right?
  977          */
  978         s = splbio();
  979         empty = (va->flags & IGNORECLEAN) && LIST_EMPTY(&vp->v_dirtyblkhd);
  980         splx(s);
  981 
  982         if (empty)
  983                 return (0);
  984 
  985 #ifdef DEBUG_SYSCTL
  986         if (busyprt)
  987                 vprint("vflush: busy vnode", vp);
  988 #endif
  989         va->busy++;
  990         return (0);
  991 }
  992 
  993 int
  994 vflush(struct mount *mp, struct vnode *skipvp, int flags)
  995 {
  996         struct vflush_args va;
  997         va.skipvp = skipvp;
  998         va.busy = 0;
  999         va.flags = flags;
 1000 
 1001         vfs_mount_foreach_vnode(mp, vflush_vnode, &va);
 1002 
 1003         if (va.busy)
 1004                 return (EBUSY);
 1005         return (0);
 1006 }
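
/*
 * Example (sketch): a file system's unmount path flushes the vnodes on
 * its mount, escalating to FORCECLOSE only for forced unmounts
 * (mntflags assumed in scope):
 *
 *	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
 *	if ((error = vflush(mp, NULLVP, flags)) != 0)
 *		return (error);
 */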
 1007 
 1008 /*
 1009  * Disassociate the underlying file system from a vnode.
 1010  */
 1011 void
 1012 vclean(struct vnode *vp, int flags, struct proc *p)
 1013 {
 1014         int active, do_wakeup = 0;
 1015         int s;
 1016 
 1017         /*
 1018          * Check to see if the vnode is in use.
  1019          * If so, we have to reference it before we clean it out
 1020          * so that its count cannot fall to zero and generate a
 1021          * race against ourselves to recycle it.
 1022          */
 1023         if ((active = vp->v_usecount) != 0)
 1024                 vp->v_usecount++;
 1025 
 1026         /*
 1027          * Prevent the vnode from being recycled or
 1028          * brought into use while we clean it out.
 1029          */
 1030         mtx_enter(&vnode_mtx);
 1031         if (vp->v_lflag & VXLOCK)
 1032                 panic("vclean: deadlock");
 1033         vp->v_lflag |= VXLOCK;
 1034 
 1035         if (vp->v_lockcount > 0) {
 1036                 /*
 1037                  * Ensure that any thread currently waiting on the same lock has
 1038                  * observed that the vnode is about to be exclusively locked
 1039                  * before continuing.
 1040                  */
 1041                 msleep_nsec(&vp->v_lockcount, &vnode_mtx, PINOD, "vop_lock",
 1042                     INFSLP);
 1043                 KASSERT(vp->v_lockcount == 0);
 1044         }
 1045         mtx_leave(&vnode_mtx);
 1046 
 1047         /*
 1048          * Even if the count is zero, the VOP_INACTIVE routine may still
 1049          * have the object locked while it cleans it out. The VOP_LOCK
 1050          * ensures that the VOP_INACTIVE routine is done with its work.
 1051          * For active vnodes, it ensures that no other activity can
 1052          * occur while the underlying object is being cleaned out.
 1053          */
 1054         VOP_LOCK(vp, LK_EXCLUSIVE | LK_DRAIN);
 1055 
 1056         /*
 1057          * Clean out any VM data associated with the vnode.
 1058          */
 1059         uvm_vnp_terminate(vp);
 1060         /*
 1061          * Clean out any buffers associated with the vnode.
 1062          */
 1063         if (flags & DOCLOSE)
 1064                 vinvalbuf(vp, V_SAVE, NOCRED, p, 0, INFSLP);
 1065         /*
 1066          * If purging an active vnode, it must be closed and
 1067          * deactivated before being reclaimed. Note that the
  1068          * VOP_INACTIVE will unlock the vnode.
 1069          */
 1070         if (active) {
 1071                 if (flags & DOCLOSE)
 1072                         VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
 1073                 VOP_INACTIVE(vp, p);
 1074         } else {
 1075                 /*
 1076                  * Any other processes trying to obtain this lock must first
 1077                  * wait for VXLOCK to clear, then call the new lock operation.
 1078                  */
 1079                 VOP_UNLOCK(vp);
 1080         }
 1081 
 1082         /*
 1083          * Reclaim the vnode.
 1084          */
 1085         if (VOP_RECLAIM(vp, p))
 1086                 panic("vclean: cannot reclaim");
 1087         if (active) {
 1088                 vp->v_usecount--;
 1089                 if (vp->v_usecount == 0) {
 1090                         s = splbio();
 1091                         if (vp->v_holdcnt > 0)
 1092                                 panic("vclean: not clean");
 1093                         vputonfreelist(vp);
 1094                         splx(s);
 1095                 }
 1096         }
 1097         cache_purge(vp);
 1098 
 1099         /*
 1100          * Done with purge, notify sleepers of the grim news.
 1101          */
 1102         vp->v_op = &dead_vops;
 1103         VN_KNOTE(vp, NOTE_REVOKE);
 1104         vp->v_tag = VT_NON;
 1105 #ifdef VFSLCKDEBUG
 1106         vp->v_flag &= ~VLOCKSWORK;
 1107 #endif
 1108         mtx_enter(&vnode_mtx);
 1109         vp->v_lflag &= ~VXLOCK;
 1110         if (vp->v_lflag & VXWANT) {
 1111                 vp->v_lflag &= ~VXWANT;
 1112                 do_wakeup = 1;
 1113         }
 1114         mtx_leave(&vnode_mtx);
 1115         if (do_wakeup)
 1116                 wakeup(vp);
 1117 }
 1118 
 1119 /*
 1120  * Recycle an unused vnode to the front of the free list.
 1121  */
 1122 int
 1123 vrecycle(struct vnode *vp, struct proc *p)
 1124 {
 1125         if (vp->v_usecount == 0) {
 1126                 vgonel(vp, p);
 1127                 return (1);
 1128         }
 1129         return (0);
 1130 }
 1131 
 1132 /*
 1133  * Eliminate all activity associated with a vnode
 1134  * in preparation for reuse.
 1135  */
 1136 void
 1137 vgone(struct vnode *vp)
 1138 {
 1139         struct proc *p = curproc;
 1140         vgonel(vp, p);
 1141 }
 1142 
 1143 /*
 1144  * vgone, with struct proc.
 1145  */
 1146 void
 1147 vgonel(struct vnode *vp, struct proc *p)
 1148 {
 1149         struct vnode *vq;
 1150         struct vnode *vx;
 1151         int s;
 1152 
 1153         KASSERT(vp->v_uvcount == 0);
 1154 
 1155         /*
 1156          * If a vgone (or vclean) is already in progress,
 1157          * wait until it is done and return.
 1158          */
 1159         mtx_enter(&vnode_mtx);
 1160         if (vp->v_lflag & VXLOCK) {
 1161                 vp->v_lflag |= VXWANT;
 1162                 msleep_nsec(vp, &vnode_mtx, PINOD, "vgone", INFSLP);
 1163                 mtx_leave(&vnode_mtx);
 1164                 return;
 1165         }
 1166         mtx_leave(&vnode_mtx);
 1167 
 1168         /*
 1169          * Clean out the filesystem specific data.
 1170          */
 1171         vclean(vp, DOCLOSE, p);
 1172         /*
 1173          * Delete from old mount point vnode list, if on one.
 1174          */
 1175         if (vp->v_mount != NULL)
 1176                 insmntque(vp, NULL);
 1177         /*
 1178          * If special device, remove it from special device alias list
 1179          * if it is on one.
 1180          */
 1181         if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
 1182             vp->v_specinfo != NULL) {
 1183                 if ((vp->v_flag & VALIASED) == 0 && vp->v_type == VCHR &&
 1184                     (cdevsw[major(vp->v_rdev)].d_flags & D_CLONE) &&
 1185                     (minor(vp->v_rdev) >> CLONE_SHIFT == 0)) {
 1186                         free(vp->v_specbitmap, M_VNODE, CLONE_MAPSZ);
 1187                 }
 1188                 SLIST_REMOVE(vp->v_hashchain, vp, vnode, v_specnext);
 1189                 if (vp->v_flag & VALIASED) {
 1190                         vx = NULL;
 1191                         SLIST_FOREACH(vq, vp->v_hashchain, v_specnext) {
 1192                                 if (vq->v_rdev != vp->v_rdev ||
 1193                                     vq->v_type != vp->v_type)
 1194                                         continue;
 1195                                 if (vx)
 1196                                         break;
 1197                                 vx = vq;
 1198                         }
 1199                         if (vx == NULL)
 1200                                 panic("missing alias");
 1201                         if (vq == NULL)
 1202                                 vx->v_flag &= ~VALIASED;
 1203                         vp->v_flag &= ~VALIASED;
 1204                 }
 1205                 lf_purgelocks(&vp->v_speclockf);
 1206                 free(vp->v_specinfo, M_VNODE, sizeof(struct specinfo));
 1207                 vp->v_specinfo = NULL;
 1208         }
 1209         /*
 1210          * If it is on the freelist and not already at the head,
 1211          * move it to the head of the list.
 1212          */
 1213         vp->v_type = VBAD;
 1214 
 1215         /*
 1216          * Move onto the free list, unless we were called from
 1217          * getnewvnode and we're not on any free list
 1218          */
 1219         s = splbio();
 1220         if (vp->v_usecount == 0 &&
 1221             (vp->v_bioflag & VBIOONFREELIST)) {
 1222                 if (vp->v_holdcnt > 0)
 1223                         panic("vgonel: not clean");
 1224 
 1225                 if (TAILQ_FIRST(&vnode_free_list) != vp) {
 1226                         TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 1227                         TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 1228                 }
 1229         }
 1230         splx(s);
 1231 }
 1232 
 1233 /*
 1234  * Lookup a vnode by device number.
 1235  */
 1236 int
 1237 vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
 1238 {
 1239         struct vnode *vp;
  1240         int rc = 0;
 1241 
 1242         SLIST_FOREACH(vp, &speclisth[SPECHASH(dev)], v_specnext) {
 1243                 if (dev != vp->v_rdev || type != vp->v_type)
 1244                         continue;
 1245                 *vpp = vp;
 1246                 rc = 1;
 1247                 break;
 1248         }
 1249         return (rc);
 1250 }
 1251 
 1252 /*
 1253  * Revoke all the vnodes corresponding to the specified minor number
 1254  * range (endpoints inclusive) of the specified major.
 1255  */
 1256 void
 1257 vdevgone(int maj, int minl, int minh, enum vtype type)
 1258 {
 1259         struct vnode *vp;
 1260         int mn;
 1261 
 1262         for (mn = minl; mn <= minh; mn++)
 1263                 if (vfinddev(makedev(maj, mn), type, &vp))
 1264                         VOP_REVOKE(vp, REVOKEALL);
 1265 }
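
/*
 * Example (sketch): a detaching disk driver revokes any outstanding
 * device vnodes for its partitions, for both device types (bmaj, cmaj
 * and mn are illustrative):
 *
 *	vdevgone(bmaj, mn, mn + MAXPARTITIONS - 1, VBLK);
 *	vdevgone(cmaj, mn, mn + MAXPARTITIONS - 1, VCHR);
 */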
 1266 
 1267 /*
 1268  * Calculate the total number of references to a special device.
 1269  */
 1270 int
 1271 vcount(struct vnode *vp)
 1272 {
 1273         struct vnode *vq;
 1274         int count;
 1275 
 1276 loop:
 1277         if ((vp->v_flag & VALIASED) == 0)
 1278                 return (vp->v_usecount);
 1279         count = 0;
 1280         SLIST_FOREACH(vq, vp->v_hashchain, v_specnext) {
 1281                 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
 1282                         continue;
 1283                 /*
 1284                  * Alias, but not in use, so flush it out.
 1285                  */
 1286                 if (vq->v_usecount == 0 && vq != vp) {
 1287                         vgone(vq);
 1288                         goto loop;
 1289                 }
 1290                 count += vq->v_usecount;
 1291         }
 1292         return (count);
 1293 }
 1294 
 1295 #if defined(DEBUG) || defined(DIAGNOSTIC)
 1296 /*
 1297  * Print out a description of a vnode.
 1298  */
 1299 static char *typename[] =
 1300    { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
 1301 
 1302 void
 1303 vprint(char *label, struct vnode *vp)
 1304 {
 1305         char buf[64];
 1306 
 1307         if (label != NULL)
 1308                 printf("%s: ", label);
 1309         printf("%p, type %s, use %u, write %u, hold %u,",
 1310                 vp, typename[vp->v_type], vp->v_usecount, vp->v_writecount,
 1311                 vp->v_holdcnt);
 1312         buf[0] = '\0';
 1313         if (vp->v_flag & VROOT)
 1314                 strlcat(buf, "|VROOT", sizeof buf);
 1315         if (vp->v_flag & VTEXT)
 1316                 strlcat(buf, "|VTEXT", sizeof buf);
 1317         if (vp->v_flag & VSYSTEM)
 1318                 strlcat(buf, "|VSYSTEM", sizeof buf);
 1319         if (vp->v_lflag & VXLOCK)
 1320                 strlcat(buf, "|VXLOCK", sizeof buf);
 1321         if (vp->v_lflag & VXWANT)
 1322                 strlcat(buf, "|VXWANT", sizeof buf);
 1323         if (vp->v_bioflag & VBIOWAIT)
 1324                 strlcat(buf, "|VBIOWAIT", sizeof buf);
 1325         if (vp->v_bioflag & VBIOONFREELIST)
 1326                 strlcat(buf, "|VBIOONFREELIST", sizeof buf);
 1327         if (vp->v_bioflag & VBIOONSYNCLIST)
 1328                 strlcat(buf, "|VBIOONSYNCLIST", sizeof buf);
 1329         if (vp->v_flag & VALIASED)
 1330                 strlcat(buf, "|VALIASED", sizeof buf);
 1331         if (buf[0] != '\0')
 1332                 printf(" flags (%s)", &buf[1]);
 1333         if (vp->v_data == NULL) {
 1334                 printf("\n");
 1335         } else {
 1336                 printf("\n\t");
 1337                 VOP_PRINT(vp);
 1338         }
 1339 }
 1340 #endif /* DEBUG || DIAGNOSTIC */
 1341 
 1342 #ifdef DEBUG
 1343 /*
 1344  * List all of the locked vnodes in the system.
 1345  * Called when debugging the kernel.
 1346  */
 1347 void
 1348 printlockedvnodes(void)
 1349 {
 1350         struct mount *mp;
 1351         struct vnode *vp;
 1352 
 1353         printf("Locked vnodes\n");
 1354 
 1355         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 1356                 if (vfs_busy(mp, VB_READ|VB_NOWAIT))
 1357                         continue;
 1358                 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
 1359                         if (VOP_ISLOCKED(vp))
 1360                                 vprint(NULL, vp);
 1361                 }
 1362                 vfs_unbusy(mp);
 1363         }
 1364 
 1365 }
 1366 #endif
 1367 
 1368 /*
 1369  * Top level filesystem related information gathering.
 1370  */
 1371 int
 1372 vfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
 1373     size_t newlen, struct proc *p)
 1374 {
 1375         struct vfsconf *vfsp, *tmpvfsp;
 1376         int ret;
 1377 
 1378         /* all sysctl names at this level are at least name and field */
 1379         if (namelen < 2)
 1380                 return (ENOTDIR);               /* overloaded */
 1381 
 1382         if (name[0] != VFS_GENERIC) {
 1383                 vfsp = vfs_bytypenum(name[0]);
 1384                 if (vfsp == NULL || vfsp->vfc_vfsops->vfs_sysctl == NULL)
 1385                         return (EOPNOTSUPP);
 1386 
 1387                 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
 1388                     oldp, oldlenp, newp, newlen, p));
 1389         }
 1390 
 1391         switch (name[1]) {
 1392         case VFS_MAXTYPENUM:
 1393                 return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
 1394 
 1395         case VFS_CONF:
 1396                 if (namelen < 3)
 1397                         return (ENOTDIR);       /* overloaded */
 1398 
 1399                 vfsp = vfs_bytypenum(name[2]);
 1400                 if (vfsp == NULL)
 1401                         return (EOPNOTSUPP);
 1402 
 1403                 /* Make a copy, clear out kernel pointers */
 1404                 tmpvfsp = malloc(sizeof(*tmpvfsp), M_TEMP, M_WAITOK|M_ZERO);
 1405                 memcpy(tmpvfsp, vfsp, sizeof(*tmpvfsp));
 1406                 tmpvfsp->vfc_vfsops = NULL;
 1407 
 1408                 ret = sysctl_rdstruct(oldp, oldlenp, newp, tmpvfsp,
 1409                     sizeof(struct vfsconf));
 1410 
 1411                 free(tmpvfsp, M_TEMP, sizeof(*tmpvfsp));
 1412                 return (ret);
 1413         case VFS_BCACHESTAT:    /* buffer cache statistics */
 1414                 ret = sysctl_rdstruct(oldp, oldlenp, newp, &bcstats,
 1415                     sizeof(struct bcachestats));
 1416                 return(ret);
 1417         }
 1418         return (EOPNOTSUPP);
 1419 }
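
/*
 * Example (sketch, userland): the VFS_BCACHESTAT node handled above is
 * reached from user space with the mib { CTL_VFS, VFS_GENERIC,
 * VFS_BCACHESTAT }:
 *
 *	int mib[3] = { CTL_VFS, VFS_GENERIC, VFS_BCACHESTAT };
 *	struct bcachestats bcs;
 *	size_t len = sizeof(bcs);
 *
 *	if (sysctl(mib, 3, &bcs, &len, NULL, 0) == -1)
 *		err(1, "sysctl");
 */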
 1420 
 1421 /*
 1422  * Check to see if a filesystem is mounted on a block device.
 1423  */
 1424 int
 1425 vfs_mountedon(struct vnode *vp)
 1426 {
 1427         struct vnode *vq;
 1428         int error = 0;
 1429 
 1430         if (vp->v_specmountpoint != NULL)
 1431                 return (EBUSY);
 1432         if (vp->v_flag & VALIASED) {
 1433                 SLIST_FOREACH(vq, vp->v_hashchain, v_specnext) {
 1434                         if (vq->v_rdev != vp->v_rdev ||
 1435                             vq->v_type != vp->v_type)
 1436                                 continue;
 1437                         if (vq->v_specmountpoint != NULL) {
 1438                                 error = EBUSY;
 1439                                 break;
 1440                         }
 1441                 }
 1442         }
 1443         return (error);
 1444 }
 1445 
 1446 #ifdef NFSSERVER
 1447 /*
 1448  * Build hash lists of net addresses and hang them off the mount point.
 1449  * Called by vfs_export() to set up the lists of export addresses.
 1450  */
 1451 int
 1452 vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
 1453     struct export_args *argp)
 1454 {
 1455         struct netcred *np;
 1456         struct radix_node_head *rnh;
 1457         int nplen, i;
 1458         struct radix_node *rn;
 1459         struct sockaddr *saddr, *smask = NULL;
 1460         int error;
 1461 
 1462         if (argp->ex_addrlen == 0) {
 1463                 if (mp->mnt_flag & MNT_DEFEXPORTED)
 1464                         return (EPERM);
 1465                 np = &nep->ne_defexported;
 1466                 /* fill in the kernel's ucred from userspace's xucred */
 1467                 if ((error = crfromxucred(&np->netc_anon, &argp->ex_anon)))
 1468                         return (error);
 1469                 mp->mnt_flag |= MNT_DEFEXPORTED;
 1470                 goto finish;
 1471         }
 1472         if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN ||
 1473             argp->ex_addrlen < 0 || argp->ex_masklen < 0)
 1474                 return (EINVAL);
 1475         nplen = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
 1476         np = (struct netcred *)malloc(nplen, M_NETADDR, M_WAITOK|M_ZERO);
 1477         np->netc_len = nplen;
 1478         saddr = (struct sockaddr *)(np + 1);
 1479         error = copyin(argp->ex_addr, saddr, argp->ex_addrlen);
 1480         if (error)
 1481                 goto out;
 1482         if (saddr->sa_len > argp->ex_addrlen)
 1483                 saddr->sa_len = argp->ex_addrlen;
 1484         if (argp->ex_masklen) {
 1485                 smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
 1486                 error = copyin(argp->ex_mask, smask, argp->ex_masklen);
 1487                 if (error)
 1488                         goto out;
 1489                 if (smask->sa_len > argp->ex_masklen)
 1490                         smask->sa_len = argp->ex_masklen;
 1491         }
 1492         /* fill in the kernel's ucred from userspace's xucred */
 1493         if ((error = crfromxucred(&np->netc_anon, &argp->ex_anon)))
 1494                 goto out;
 1495         i = saddr->sa_family;
 1496         switch (i) {
 1497         case AF_INET:
 1498                 if ((rnh = nep->ne_rtable_inet) == NULL) {
 1499                         if (!rn_inithead((void **)&nep->ne_rtable_inet,
 1500                             offsetof(struct sockaddr_in, sin_addr))) {
 1501                                 error = ENOBUFS;
 1502                                 goto out;
 1503                         }
 1504                         rnh = nep->ne_rtable_inet;
 1505                 }
 1506                 break;
 1507         default:
 1508                 error = EINVAL;
 1509                 goto out;
 1510         }
 1511         rn = rn_addroute(saddr, smask, rnh, np->netc_rnodes, 0);
 1512         if (rn == NULL || np != (struct netcred *)rn) { /* already exists */
 1513                 error = EPERM;
 1514                 goto out;
 1515         }
 1516 finish:
 1517         np->netc_exflags = argp->ex_flags;
 1518         return (0);
 1519 out:
 1520         free(np, M_NETADDR, np->netc_len);
 1521         return (error);
 1522 }
 1523 
 1524 int
 1525 vfs_free_netcred(struct radix_node *rn, void *w, u_int id)
 1526 {
 1527         struct radix_node_head *rnh = (struct radix_node_head *)w;
 1528         struct netcred * np = (struct netcred *)rn;
 1529 
 1530         rn_delete(rn->rn_key, rn->rn_mask, rnh, NULL);
 1531         free(np, M_NETADDR, np->netc_len);
 1532         return (0);
 1533 }
 1534 
 1535 /*
 1536  * Free the net address hash lists that are hanging off the mount points.
 1537  */
 1538 void
 1539 vfs_free_addrlist(struct netexport *nep)
 1540 {
 1541         struct radix_node_head *rnh;
 1542 
 1543         if ((rnh = nep->ne_rtable_inet) != NULL) {
 1544                 rn_walktree(rnh, vfs_free_netcred, rnh);
 1545                 free(rnh, M_RTABLE, sizeof(*rnh));
 1546                 nep->ne_rtable_inet = NULL;
 1547         }
 1548 }
 1549 #endif /* NFSSERVER */
 1550 
 1551 int
 1552 vfs_export(struct mount *mp, struct netexport *nep, struct export_args *argp)
 1553 {
 1554 #ifdef NFSSERVER
 1555         int error;
 1556 
 1557         if (argp->ex_flags & MNT_DELEXPORT) {
 1558                 vfs_free_addrlist(nep);
 1559                 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
 1560         }
 1561         if (argp->ex_flags & MNT_EXPORTED) {
 1562                 if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
 1563                         return (error);
 1564                 mp->mnt_flag |= MNT_EXPORTED;
 1565         }
 1566         return (0);
 1567 #else
 1568         return (ENOTSUP);
 1569 #endif /* NFSSERVER */
 1570 }
 1571 
 1572 struct netcred *
 1573 vfs_export_lookup(struct mount *mp, struct netexport *nep, struct mbuf *nam)
 1574 {
 1575 #ifdef NFSSERVER
 1576         struct netcred *np;
 1577         struct radix_node_head *rnh;
 1578         struct sockaddr *saddr;
 1579 
 1580         np = NULL;
 1581         if (mp->mnt_flag & MNT_EXPORTED) {
 1582                 /*
 1583                  * Lookup in the export list first.
 1584                  */
 1585                 if (nam != NULL) {
 1586                         saddr = mtod(nam, struct sockaddr *);
 1587                         switch (saddr->sa_family) {
 1588                         case AF_INET:
 1589                                 rnh = nep->ne_rtable_inet;
 1590                                 break;
 1591                         default:
 1592                                 rnh = NULL;
 1593                                 break;
 1594                         }
 1595                         if (rnh != NULL)
 1596                                 np = (struct netcred *)rn_match(saddr, rnh);
 1597                 }
 1598                 /*
 1599                  * If no address match, use the default if it exists.
 1600                  */
 1601                 if (np == NULL && (mp->mnt_flag & MNT_DEFEXPORTED))
 1602                         np = &nep->ne_defexported;
 1603         }
 1604         return (np);
 1605 #else
 1606         return (NULL);
 1607 #endif /* NFSSERVER */
 1608 }
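
/*
 * Illustrative sketch, not from this file: the NFS server would map a
 * client's address (the sockaddr in the mbuf "nam") to per-network
 * credentials before honouring a request; the EACCES here is
 * illustrative.
 */
#if 0
        struct netcred *nc;

        nc = vfs_export_lookup(mp, nep, nam);
        if (nc == NULL)
                return (EACCES);        /* host not covered by any export */
        /* nc->netc_exflags and nc->netc_anon now govern the request. */
#endif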
 1609 
 1610 /*
 1611  * Do the usual access checking.
 1612  * file_mode, uid and gid are from the vnode in question,
 1613  * while acc_mode and cred are from the VOP_ACCESS parameter list.
 1614  */
 1615 int
 1616 vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid,
 1617     mode_t acc_mode, struct ucred *cred)
 1618 {
 1619         mode_t mask;
 1620 
 1621         /* User id 0 always gets read/write access. */
 1622         if (cred->cr_uid == 0) {
 1623                 /* For VEXEC, at least one of the execute bits must be set. */
 1624                 if ((acc_mode & VEXEC) && type != VDIR &&
 1625                     (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
 1626                         return EACCES;
 1627                 return 0;
 1628         }
 1629 
 1630         mask = 0;
 1631 
 1632         /* Otherwise, check the owner. */
 1633         if (cred->cr_uid == uid) {
 1634                 if (acc_mode & VEXEC)
 1635                         mask |= S_IXUSR;
 1636                 if (acc_mode & VREAD)
 1637                         mask |= S_IRUSR;
 1638                 if (acc_mode & VWRITE)
 1639                         mask |= S_IWUSR;
 1640                 return (file_mode & mask) == mask ? 0 : EACCES;
 1641         }
 1642 
 1643         /* Otherwise, check the groups. */
 1644         if (groupmember(gid, cred)) {
 1645                 if (acc_mode & VEXEC)
 1646                         mask |= S_IXGRP;
 1647                 if (acc_mode & VREAD)
 1648                         mask |= S_IRGRP;
 1649                 if (acc_mode & VWRITE)
 1650                         mask |= S_IWGRP;
 1651                 return (file_mode & mask) == mask ? 0 : EACCES;
 1652         }
 1653 
 1654         /* Otherwise, check everyone else. */
 1655         if (acc_mode & VEXEC)
 1656                 mask |= S_IXOTH;
 1657         if (acc_mode & VREAD)
 1658                 mask |= S_IROTH;
 1659         if (acc_mode & VWRITE)
 1660                 mask |= S_IWOTH;
 1661         return (file_mode & mask) == mask ? 0 : EACCES;
 1662 }
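
/*
 * Illustrative sketch, not from this file: a worked example of the
 * mask logic above.  For a regular file with mode 0640, uid 100 and
 * gid 10, the owner (cr_uid == 100) asking for VREAD|VWRITE builds
 * mask = S_IRUSR|S_IWUSR = 0600, and 0640 & 0600 == 0600, so the call
 * returns 0.  A mere group member asking for VWRITE builds
 * mask = S_IWGRP = 0020, and 0640 & 0020 == 0, so it returns EACCES.
 */
#if 0
        error = vaccess(VREG, 0640, 100, 10, VREAD | VWRITE, cred);
#endif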
 1663 
 1664 int
 1665 vnoperm(struct vnode *vp)
 1666 {
 1667         if ((vp->v_flag & VROOT) || vp->v_mount == NULL)
 1668                 return 0;
 1669 
 1670         return (vp->v_mount->mnt_flag & MNT_NOPERM);
 1671 }
 1672 
 1673 struct rwlock vfs_stall_lock = RWLOCK_INITIALIZER("vfs_stall");
 1674 unsigned int vfs_stalling = 0;
 1675 
 1676 int
 1677 vfs_stall(struct proc *p, int stall)
 1678 {
 1679         struct mount *mp;
 1680         int allerror = 0, error;
 1681 
 1682         if (stall) {
 1683                 atomic_inc_int(&vfs_stalling);
 1684                 rw_enter_write(&vfs_stall_lock);
 1685         }
 1686 
 1687         /*
 1688          * The loop variable mp is protected by vfs_busy() so that it cannot
 1689          * be unmounted while VFS_SYNC() sleeps.  Traverse forward to keep the
 1690          * lock order consistent with dounmount().
 1691          */
 1692         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 1693                 if (stall) {
 1694                         error = vfs_busy(mp, VB_WRITE|VB_WAIT|VB_DUPOK);
 1695                         if (error) {
 1696                                 printf("%s: busy\n", mp->mnt_stat.f_mntonname);
 1697                                 allerror = error;
 1698                                 continue;
 1699                         }
 1700                         uvm_vnp_sync(mp);
 1701                         error = VFS_SYNC(mp, MNT_WAIT, stall, p->p_ucred, p);
 1702                         if (error) {
 1703                                 printf("%s: failed to sync\n",
 1704                                     mp->mnt_stat.f_mntonname);
 1705                                 vfs_unbusy(mp);
 1706                                 allerror = error;
 1707                                 continue;
 1708                         }
 1709                         mp->mnt_flag |= MNT_STALLED;
 1710                 } else {
 1711                         if (mp->mnt_flag & MNT_STALLED) {
 1712                                 vfs_unbusy(mp);
 1713                                 mp->mnt_flag &= ~MNT_STALLED;
 1714                         }
 1715                 }
 1716         }
 1717 
 1718         if (!stall) {
 1719                 rw_exit_write(&vfs_stall_lock);
 1720                 atomic_dec_int(&vfs_stalling);
 1721         }
 1722 
 1723         return (allerror);
 1724 }
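
/*
 * Illustrative sketch, not from this file: vfs_stall() calls come in
 * pairs around work that needs a quiescent VFS, such as suspend.
 */
#if 0
        error = vfs_stall(curproc, 1);  /* sync, busy and stall all mounts */
        if (error == 0) {
                /* ... do the work that required quiescence ... */
        }
        (void)vfs_stall(curproc, 0);    /* unbusy mounts, release waiters */
#endif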
 1725 
 1726 void
 1727 vfs_stall_barrier(void)
 1728 {
 1729         if (__predict_false(vfs_stalling)) {
 1730                 rw_enter_read(&vfs_stall_lock);
 1731                 rw_exit_read(&vfs_stall_lock);
 1732         }
 1733 }
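
/*
 * Illustrative sketch, not from this file: code about to enter the
 * filesystem calls the barrier first.  The vfs_stalling check keeps
 * the common, non-stalled case to a single read; the barrier blocks
 * only while a stall holds vfs_stall_lock for writing.
 */
#if 0
        vfs_stall_barrier();
        /* ... proceed into VFS code ... */
#endif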
 1734 
 1735 /*
 1736  * Unmount all file systems.
 1737  * We traverse the list in reverse order so that file systems mounted
 1738  * on top of others are unmounted first, avoiding dependency problems.
 1739  */
 1740 void
 1741 vfs_unmountall(void)
 1742 {
 1743         struct mount *mp, *nmp;
 1744         int allerror, error, again = 1;
 1745 
 1746  retry:
 1747         allerror = 0;
 1748         TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, nmp) {
 1749                 if (vfs_busy(mp, VB_WRITE|VB_NOWAIT))
 1750                         continue;
 1751                 /* XXX There is a race here: the next pointer is not locked. */
 1752                 if ((error = dounmount(mp, MNT_FORCE, curproc)) != 0) {
 1753                         printf("unmount of %s failed with error %d\n",
 1754                             mp->mnt_stat.f_mntonname, error);
 1755                         allerror = 1;
 1756                 }
 1757         }
 1758 
 1759         if (allerror) {
 1760                 printf("WARNING: some file systems would not unmount\n");
 1761                 if (again) {
 1762                         printf("retrying\n");
 1763                         again = 0;
 1764                         goto retry;
 1765                 }
 1766         }
 1767 }
 1768 
 1769 /*
 1770  * Sync and unmount file systems before shutting down.
 1771  */
 1772 void
 1773 vfs_shutdown(struct proc *p)
 1774 {
 1775 #ifdef ACCOUNTING
 1776         acct_shutdown();
 1777 #endif
 1778 
 1779         printf("syncing disks...");
 1780 
 1781         if (panicstr == NULL) {
 1782                 /* Sync before unmount, in case we hang on something. */
 1783                 sys_sync(p, NULL, NULL);
 1784                 vfs_unmountall();
 1785         }
 1786 
 1787 #if NSOFTRAID > 0
 1788         sr_quiesce();
 1789 #endif
 1790 
 1791         if (vfs_syncwait(p, 1))
 1792                 printf(" giving up\n");
 1793         else
 1794                 printf(" done\n");
 1795 }
 1796 
 1797 /*
 1798  * Perform a sync() operation and wait for buffers to flush.
 1799  */
 1800 int
 1801 vfs_syncwait(struct proc *p, int verbose)
 1802 {
 1803         struct buf *bp;
 1804         int iter, nbusy, dcount, s;
 1805 #ifdef MULTIPROCESSOR
 1806         int hold_count;
 1807 #endif
 1808 
 1809         sys_sync(p, NULL, NULL);
 1810 
 1811         /* Wait for sync to finish. */
 1812         dcount = 10000;
 1813         for (iter = 0; iter < 20; iter++) {
 1814                 nbusy = 0;
 1815                 LIST_FOREACH(bp, &bufhead, b_list) {
 1816                         if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
 1817                                 nbusy++;
 1818                         /*
 1819                          * With soft updates, some buffers that are
 1820                          * written will be remarked as dirty until other
 1821                          * buffers are written.
 1822                          */
 1823                         if (bp->b_flags & B_DELWRI) {
 1824                                 s = splbio();
 1825                                 bremfree(bp);
 1826                                 buf_acquire(bp);
 1827                                 splx(s);
 1828                                 nbusy++;
 1829                                 bawrite(bp);
 1830                                 if (dcount-- <= 0) {
 1831                                         if (verbose)
 1832                                                 printf("softdep ");
 1833                                         return 1;
 1834                                 }
 1835                         }
 1836                 }
 1837                 if (nbusy == 0)
 1838                         break;
 1839                 if (verbose)
 1840                         printf("%d ", nbusy);
 1841 #ifdef MULTIPROCESSOR
 1842                 if (_kernel_lock_held())
 1843                         hold_count = __mp_release_all(&kernel_lock);
 1844                 else
 1845                         hold_count = 0;
 1846 #endif
 1847                 DELAY(40000 * iter);
 1848 #ifdef MULTIPROCESSOR
 1849                 if (hold_count)
 1850                         __mp_acquire_count(&kernel_lock, hold_count);
 1851 #endif
 1852         }
 1853 
 1854         return nbusy;
 1855 }
 1856 
 1857 /*
 1858  * POSIX file system related system variables.
 1859  */
 1860 int
 1861 fs_posix_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
 1862     void *newp, size_t newlen, struct proc *p)
 1863 {
 1864         /* all sysctl names at this level are terminal */
 1865         if (namelen != 1)
 1866                 return (ENOTDIR);
 1867 
 1868         switch (name[0]) {
 1869         case FS_POSIX_SETUID:
 1870                 return (sysctl_securelevel_int(oldp, oldlenp, newp, newlen,
 1871                     &suid_clear));
 1872         default:
 1873                 return (EOPNOTSUPP);
 1874         }
 1875         /* NOTREACHED */
 1876 }
 1877 
 1878 /*
 1879  * File system related system variables.
 1880  */
 1881 int
 1882 fs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
 1883     size_t newlen, struct proc *p)
 1884 {
 1885         sysctlfn *fn;
 1886 
 1887         switch (name[0]) {
 1888         case FS_POSIX:
 1889                 fn = fs_posix_sysctl;
 1890                 break;
 1891         default:
 1892                 return (EOPNOTSUPP);
 1893         }
 1894         return (*fn)(name + 1, namelen - 1, oldp, oldlenp, newp, newlen, p);
 1895 }
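
/*
 * Illustrative sketch, not from this file: the userland view of the
 * dispatch above -- reading fs.posix.setuid with sysctl(2).
 */
#if 0
        int mib[3] = { CTL_FS, FS_POSIX, FS_POSIX_SETUID };
        int val;
        size_t len = sizeof(val);

        if (sysctl(mib, 3, &val, &len, NULL, 0) == -1)
                err(1, "sysctl fs.posix.setuid");
#endif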
 1896 
 1897 
 1898 /*
 1899  * Routines dealing with vnodes and buffers
 1900  */
 1901 
 1902 /*
 1903  * Wait for all outstanding I/Os to complete.
 1904  *
 1905  * Manipulates v_numoutput. Must be called at splbio()
 1906  */
 1907 int
 1908 vwaitforio(struct vnode *vp, int slpflag, char *wmesg, uint64_t timeo)
 1909 {
 1910         int error = 0;
 1911 
 1912         splassert(IPL_BIO);
 1913 
 1914         while (vp->v_numoutput) {
 1915                 vp->v_bioflag |= VBIOWAIT;
 1916                 error = tsleep_nsec(&vp->v_numoutput,
 1917                     slpflag | (PRIBIO + 1), wmesg, timeo);
 1918                 if (error)
 1919                         break;
 1920         }
 1921 
 1922         return (error);
 1923 }
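
/*
 * Illustrative sketch, not from this file: the calling pattern used by
 * vinvalbuf() and vflushbuf() below -- raise to splbio() first, since
 * vwaitforio() asserts IPL_BIO.  "s", "error" and "vp" are
 * placeholders for the caller's locals.
 */
#if 0
        s = splbio();
        error = vwaitforio(vp, 0, "wmesg", INFSLP);
        splx(s);
#endif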
 1924 
 1925 /*
 1926  * Update outstanding I/O count and do wakeup if requested.
 1927  *
 1928  * Manipulates v_numoutput. Must be called at splbio()
 1929  */
 1930 void
 1931 vwakeup(struct vnode *vp)
 1932 {
 1933         splassert(IPL_BIO);
 1934 
 1935         if (vp != NULL) {
 1936                 if (vp->v_numoutput-- == 0)
 1937                         panic("vwakeup: neg numoutput");
 1938                 if ((vp->v_bioflag & VBIOWAIT) && vp->v_numoutput == 0) {
 1939                         vp->v_bioflag &= ~VBIOWAIT;
 1940                         wakeup(&vp->v_numoutput);
 1941                 }
 1942         }
 1943 }
 1944 
 1945 /*
 1946  * Flush out and invalidate all buffers associated with a vnode.
 1947  * Called with the underlying object locked.
 1948  */
 1949 int
 1950 vinvalbuf(struct vnode *vp, int flags, struct ucred *cred, struct proc *p,
 1951     int slpflag, uint64_t slptimeo)
 1952 {
 1953         struct buf *bp;
 1954         struct buf *nbp, *blist;
 1955         int s, error;
 1956 
 1957 #ifdef VFSLCKDEBUG
 1958         if ((vp->v_flag & VLOCKSWORK) && !VOP_ISLOCKED(vp))
 1959                 panic("%s: vp isn't locked, vp %p", __func__, vp);
 1960 #endif
 1961 
 1962         if (flags & V_SAVE) {
 1963                 s = splbio();
 1964                 vwaitforio(vp, 0, "vinvalbuf", INFSLP);
 1965                 if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
 1966                         splx(s);
 1967                         if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
 1968                                 return (error);
 1969                         s = splbio();
 1970                         if (vp->v_numoutput > 0 ||
 1971                             !LIST_EMPTY(&vp->v_dirtyblkhd))
 1972                                 panic("%s: dirty bufs, vp %p", __func__, vp);
 1973                 }
 1974                 splx(s);
 1975         }
 1976 loop:
 1977         s = splbio();
 1978         for (;;) {
 1979                 int count = 0;
 1980                 if ((blist = LIST_FIRST(&vp->v_cleanblkhd)) &&
 1981                     (flags & V_SAVEMETA))
 1982                         while (blist && blist->b_lblkno < 0)
 1983                                 blist = LIST_NEXT(blist, b_vnbufs);
 1984                 if (blist == NULL &&
 1985                     (blist = LIST_FIRST(&vp->v_dirtyblkhd)) &&
 1986                     (flags & V_SAVEMETA))
 1987                         while (blist && blist->b_lblkno < 0)
 1988                                 blist = LIST_NEXT(blist, b_vnbufs);
 1989                 if (!blist)
 1990                         break;
 1991 
 1992                 for (bp = blist; bp; bp = nbp) {
 1993                         nbp = LIST_NEXT(bp, b_vnbufs);
 1994                         if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
 1995                                 continue;
 1996                         if (bp->b_flags & B_BUSY) {
 1997                                 bp->b_flags |= B_WANTED;
 1998                                 error = tsleep_nsec(bp, slpflag | (PRIBIO + 1),
 1999                                     "vinvalbuf", slptimeo);
 2000                                 if (error) {
 2001                                         splx(s);
 2002                                         return (error);
 2003                                 }
 2004                                 break;
 2005                         }
 2006                         bremfree(bp);
 2007                         /*
 2008                          * XXX Since there are no node locks for NFS, I believe
 2009                          * there is a slight chance that a delayed write will
 2010                          * occur while sleeping just above, so check for it.
 2011                          */
 2012                         if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
 2013                                 buf_acquire(bp);
 2014                                 splx(s);
 2015                                 (void) VOP_BWRITE(bp);
 2016                                 goto loop;
 2017                         }
 2018                         buf_acquire_nomap(bp);
 2019                         bp->b_flags |= B_INVAL;
 2020                         brelse(bp);
 2021                         count++;
 2022                         /*
 2023                          * XXX Temporary workaround XXX
 2024                          *
 2025                          * If this is a gigantic vnode and we are
 2026                          * trashing a ton of buffers, drop the lock
 2027                          * and yield every so often. The longer-term
 2028                          * fix is to keep a separate list of these
 2029                          * invalid buffers so we don't have to do the
 2030                          * work to free them here.
 2031                          */
 2032                         if (count > 100) {
 2033                                 splx(s);
 2034                                 sched_pause(yield);
 2035                                 goto loop;
 2036                         }
 2037                 }
 2038         }
 2039         if (!(flags & V_SAVEMETA) &&
 2040             (!LIST_EMPTY(&vp->v_dirtyblkhd) || !LIST_EMPTY(&vp->v_cleanblkhd)))
 2041                 panic("%s: flush failed, vp %p", __func__, vp);
 2042         splx(s);
 2043         return (0);
 2044 }
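
/*
 * Illustrative sketch, not from this file: a typical call when a vnode
 * is being cleaned for reuse -- flush dirty buffers to disk (V_SAVE)
 * and invalidate everything, sleeping without a timeout.
 */
#if 0
        error = vinvalbuf(vp, V_SAVE, cred, p, 0, INFSLP);
#endif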
 2045 
 2046 void
 2047 vflushbuf(struct vnode *vp, int sync)
 2048 {
 2049         struct buf *bp, *nbp;
 2050         int s;
 2051 
 2052 loop:
 2053         s = splbio();
 2054         LIST_FOREACH_SAFE(bp, &vp->v_dirtyblkhd, b_vnbufs, nbp) {
 2055                 if ((bp->b_flags & B_BUSY))
 2056                         continue;
 2057                 if ((bp->b_flags & B_DELWRI) == 0)
 2058                         panic("vflushbuf: not dirty");
 2059                 bremfree(bp);
 2060                 buf_acquire(bp);
 2061                 splx(s);
 2062                 /*
 2063                  * Wait for I/O associated with indirect blocks to complete,
 2064                  * since there is no way to quickly wait for them below.
 2065                  */
 2066                 if (bp->b_vp == vp || sync == 0)
 2067                         (void) bawrite(bp);
 2068                 else
 2069                         (void) bwrite(bp);
 2070                 goto loop;
 2071         }
 2072         if (sync == 0) {
 2073                 splx(s);
 2074                 return;
 2075         }
 2076         vwaitforio(vp, 0, "vflushbuf", INFSLP);
 2077         if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
 2078                 splx(s);
 2079 #ifdef DIAGNOSTIC
 2080                 vprint("vflushbuf: dirty", vp);
 2081 #endif
 2082                 goto loop;
 2083         }
 2084         splx(s);
 2085 }
 2086 
 2087 /*
 2088  * Associate a buffer with a vnode.
 2089  *
 2090  * Manipulates buffer vnode queues. Must be called at splbio().
 2091  */
 2092 void
 2093 bgetvp(struct vnode *vp, struct buf *bp)
 2094 {
 2095         splassert(IPL_BIO);
 2096 
 2098         if (bp->b_vp)
 2099                 panic("bgetvp: not free");
 2100         vhold(vp);
 2101         bp->b_vp = vp;
 2102         if (vp->v_type == VBLK || vp->v_type == VCHR)
 2103                 bp->b_dev = vp->v_rdev;
 2104         else
 2105                 bp->b_dev = NODEV;
 2106         /*
 2107          * Insert onto list for new vnode.
 2108          */
 2109         bufinsvn(bp, &vp->v_cleanblkhd);
 2110 }
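
/*
 * Illustrative sketch, not from this file: bgetvp() and brelvp()
 * bracket a buffer's association with a vnode, and both require
 * splbio(); the vhold()/vdrop() calls inside them keep the vnode from
 * being recycled while it still owns buffers.
 */
#if 0
        s = splbio();
        bgetvp(vp, bp);         /* buffer now counts against vp */
        /* ... set up and issue I/O ... */
        brelvp(bp);             /* drop the association again */
        splx(s);
#endif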
 2111 
 2112 /*
 2113  * Disassociate a buffer from a vnode.
 2114  *
 2115  * Manipulates vnode buffer queues. Must be called at splbio().
 2116  */
 2117 void
 2118 brelvp(struct buf *bp)
 2119 {
 2120         struct vnode *vp;
 2121 
 2122         splassert(IPL_BIO);
 2123 
 2124         if ((vp = bp->b_vp) == NULL)
 2125                 panic("brelvp: NULL");
 2126         /*
 2127          * Delete from old vnode list, if on one.
 2128          */
 2129         if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
 2130                 bufremvn(bp);
 2131         if ((vp->v_bioflag & VBIOONSYNCLIST) &&
 2132             LIST_EMPTY(&vp->v_dirtyblkhd)) {
 2133                 vp->v_bioflag &= ~VBIOONSYNCLIST;
 2134                 LIST_REMOVE(vp, v_synclist);
 2135         }
 2136         bp->b_vp = NULL;
 2137 
 2138         vdrop(vp);
 2139 }
 2140 
 2141 /*
 2142  * Replaces the current vnode associated with the buffer, if any,
 2143  * with a new vnode.
 2144  *
 2145  * If an output I/O is pending on the buffer, the old vnode
 2146  * I/O count is adjusted.
 2147  *
 2148  * Ignores vnode buffer queues. Must be called at splbio().
 2149  */
 2150 void
 2151 buf_replacevnode(struct buf *bp, struct vnode *newvp)
 2152 {
 2153         struct vnode *oldvp = bp->b_vp;
 2154 
 2155         splassert(IPL_BIO);
 2156 
 2157         if (oldvp)
 2158                 brelvp(bp);
 2159 
 2160         if ((bp->b_flags & (B_READ | B_DONE)) == 0) {
 2161                 newvp->v_numoutput++;   /* put it on swapdev */
 2162                 vwakeup(oldvp);
 2163         }
 2164 
 2165         bgetvp(newvp, bp);
 2166         bufremvn(bp);
 2167 }
 2168 
 2169 /*
 2170  * Used to assign buffers to the appropriate clean or dirty list on
 2171  * the vnode and to add newly dirty vnodes to the appropriate
 2172  * filesystem syncer list.
 2173  *
 2174  * Manipulates vnode buffer queues. Must be called at splbio().
 2175  */
 2176 void
 2177 reassignbuf(struct buf *bp)
 2178 {
 2179         struct buflists *listheadp;
 2180         int delay;
 2181         struct vnode *vp = bp->b_vp;
 2182 
 2183         splassert(IPL_BIO);
 2184 
 2185         /*
 2186          * Delete from old vnode list, if on one.
 2187          */
 2188         if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
 2189                 bufremvn(bp);
 2190 
 2191         /*
 2192          * If dirty, put on list of dirty buffers;
 2193          * otherwise insert onto list of clean buffers.
 2194          */
 2195         if ((bp->b_flags & B_DELWRI) == 0) {
 2196                 listheadp = &vp->v_cleanblkhd;
 2197                 if ((vp->v_bioflag & VBIOONSYNCLIST) &&
 2198                     LIST_EMPTY(&vp->v_dirtyblkhd)) {
 2199                         vp->v_bioflag &= ~VBIOONSYNCLIST;
 2200                         LIST_REMOVE(vp, v_synclist);
 2201                 }
 2202         } else {
 2203                 listheadp = &vp->v_dirtyblkhd;
 2204                 if ((vp->v_bioflag & VBIOONSYNCLIST) == 0) {
 2205                         switch (vp->v_type) {
 2206                         case VDIR:
 2207                                 delay = syncdelay / 2;
 2208                                 break;
 2209                         case VBLK:
 2210                                 if (vp->v_specmountpoint != NULL) {
 2211                                         delay = syncdelay / 3;
 2212                                         break;
 2213                                 }
 2214                                 /* FALLTHROUGH */
 2215                         default:
 2216                                 delay = syncdelay;
 2217                         }
 2218                         vn_syncer_add_to_worklist(vp, delay);
 2219                 }
 2220         }
 2221         bufinsvn(bp, listheadp);
 2222 }
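
/*
 * Illustrative sketch, not from this file (the real delayed-write path
 * does more bookkeeping): a caller marks the buffer dirty and then
 * calls reassignbuf(), which moves the buffer to the vnode's dirty
 * list and queues the vnode for the syncer -- directories at
 * syncdelay/2, mounted block devices at syncdelay/3, everything else
 * at syncdelay.
 */
#if 0
        s = splbio();
        bp->b_flags |= B_DELWRI;
        reassignbuf(bp);
        splx(s);
#endif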
 2223 
 2224 /*
 2225  * Check if vnode represents a disk device
 2226  */
 2227 int
 2228 vn_isdisk(struct vnode *vp, int *errp)
 2229 {
 2230         if (vp->v_type != VBLK && vp->v_type != VCHR)
 2231                 return (0);
 2232 
 2233         return (1);
 2234 }
 2235 
 2236 #ifdef DDB
 2237 #include <machine/db_machdep.h>
 2238 #include <ddb/db_interface.h>
 2239 
 2240 void
 2241 vfs_buf_print(void *b, int full,
 2242     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
 2243 {
 2244         struct buf *bp = b;
 2245 
 2246         (*pr)("  vp %p lblkno 0x%llx blkno 0x%llx dev 0x%x\n"
 2247               "  proc %p error %d flags %lb\n",
 2248             bp->b_vp, (int64_t)bp->b_lblkno, (int64_t)bp->b_blkno, bp->b_dev,
 2249             bp->b_proc, bp->b_error, bp->b_flags, B_BITS);
 2250 
 2251         (*pr)("  bufsize 0x%lx bcount 0x%lx resid 0x%lx\n"
 2252               "  data %p saveaddr %p dep %p iodone %p\n",
 2253             bp->b_bufsize, bp->b_bcount, (long)bp->b_resid,
 2254             bp->b_data, bp->b_saveaddr,
 2255             LIST_FIRST(&bp->b_dep), bp->b_iodone);
 2256 
 2257         (*pr)("  dirty {off 0x%x end 0x%x} valid {off 0x%x end 0x%x}\n",
 2258             bp->b_dirtyoff, bp->b_dirtyend, bp->b_validoff, bp->b_validend);
 2259 
 2260 #ifdef FFS_SOFTUPDATES
 2261         if (full)
 2262                 softdep_print(bp, full, pr);
 2263 #endif
 2264 }
 2265 
 2266 const char *vtypes[] = { VTYPE_NAMES };
 2267 const char *vtags[] = { VTAG_NAMES };
 2268 
 2269 void
 2270 vfs_vnode_print(void *v, int full,
 2271     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
 2272 {
 2273         struct vnode *vp = v;
 2274 
 2275         (*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n",
 2276               (u_int)vp->v_tag >= nitems(vtags) ? "<unk>" : vtags[vp->v_tag],
 2277               vp->v_tag,
 2278               (u_int)vp->v_type >= nitems(vtypes) ? "<unk>" : vtypes[vp->v_type],
 2279               vp->v_type, vp->v_mount, vp->v_mountedhere);
 2280 
 2281         (*pr)("data %p usecount %d writecount %d holdcnt %d numoutput %d\n",
 2282               vp->v_data, vp->v_usecount, vp->v_writecount,
 2283               vp->v_holdcnt, vp->v_numoutput);
 2284 
 2285         /* uvm_object_printit(&vp->v_uobj, full, pr); */
 2286 
 2287         if (full) {
 2288                 struct buf *bp;
 2289 
 2290                 (*pr)("clean bufs:\n");
 2291                 LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
 2292                         (*pr)(" bp %p\n", bp);
 2293                         vfs_buf_print(bp, full, pr);
 2294                 }
 2295 
 2296                 (*pr)("dirty bufs:\n");
 2297                 LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
 2298                         (*pr)(" bp %p\n", bp);
 2299                         vfs_buf_print(bp, full, pr);
 2300                 }
 2301         }
 2302 }
 2303 
 2304 void
 2305 vfs_mount_print(struct mount *mp, int full,
 2306     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
 2307 {
 2308         struct vfsconf *vfc = mp->mnt_vfc;
 2309         struct vnode *vp;
 2310         int cnt;
 2311 
 2312         (*pr)("flags %b\nvnodecovered %p syncer %p data %p\n",
 2313             mp->mnt_flag, MNT_BITS,
 2314             mp->mnt_vnodecovered, mp->mnt_syncer, mp->mnt_data);
 2315 
 2316         (*pr)("vfsconf: ops %p name \"%s\" num %d ref %u flags 0x%x\n",
 2317             vfc->vfc_vfsops, vfc->vfc_name, vfc->vfc_typenum,
 2318             vfc->vfc_refcount, vfc->vfc_flags);
 2319 
 2320         (*pr)("statvfs cache: bsize %x iosize %x\n"
 2321             "blocks %llu free %llu avail %lld\n",
 2322             mp->mnt_stat.f_bsize, mp->mnt_stat.f_iosize, mp->mnt_stat.f_blocks,
 2323             mp->mnt_stat.f_bfree, mp->mnt_stat.f_bavail);
 2324 
 2325         (*pr)("  files %llu ffiles %llu favail %lld\n", mp->mnt_stat.f_files,
 2326             mp->mnt_stat.f_ffree, mp->mnt_stat.f_favail);
 2327 
 2328         (*pr)("  f_fsidx {0x%x, 0x%x} owner %u ctime 0x%llx\n",
 2329             mp->mnt_stat.f_fsid.val[0], mp->mnt_stat.f_fsid.val[1],
 2330             mp->mnt_stat.f_owner, mp->mnt_stat.f_ctime);
 2331 
 2332         (*pr)("  syncwrites %llu asyncwrites %llu\n",
 2333             mp->mnt_stat.f_syncwrites, mp->mnt_stat.f_asyncwrites);
 2334 
 2335         (*pr)("  syncreads %llu asyncreads %llu\n",
 2336             mp->mnt_stat.f_syncreads, mp->mnt_stat.f_asyncreads);
 2337 
 2338         (*pr)("  fstype \"%s\" mnton \"%s\" mntfrom \"%s\" mntspec \"%s\"\n",
 2339             mp->mnt_stat.f_fstypename, mp->mnt_stat.f_mntonname,
 2340             mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntfromspec);
 2341 
 2342         (*pr)("locked vnodes:");
 2343         /* XXX would take mountlist lock, except ddb has no context */
 2344         cnt = 0;
 2345         TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
 2346                 if (VOP_ISLOCKED(vp)) {
 2347                         if (cnt == 0)
 2348                                 (*pr)("\n  %p", vp);
 2349                         else if ((cnt % (72 / (sizeof(void *) * 2 + 4))) == 0)
 2350                                 (*pr)(",\n  %p", vp);
 2351                         else
 2352                                 (*pr)(", %p", vp);
 2353                         cnt++;
 2354                 }
 2355         }
 2356         (*pr)("\n");
 2357 
 2358         if (full) {
 2359                 (*pr)("all vnodes:");
 2360                 /* XXX would take mountlist lock, except ddb has no context */
 2361                 cnt = 0;
 2362                 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
 2363                         if (cnt == 0)
 2364                                 (*pr)("\n  %p", vp);
 2365                         else if ((cnt % (72 / (sizeof(void *) * 2 + 4))) == 0)
 2366                                 (*pr)(",\n  %p", vp);
 2367                         else
 2368                                 (*pr)(", %p", vp);
 2369                         cnt++;
 2370                 }
 2371                 (*pr)("\n");
 2372         }
 2373 }
 2374 #endif /* DDB */
 2375 
 2376 void
 2377 copy_statfs_info(struct statfs *sbp, const struct mount *mp)
 2378 {
 2379         const struct statfs *mbp;
 2380 
 2381         strncpy(sbp->f_fstypename, mp->mnt_vfc->vfc_name, MFSNAMELEN);
 2382 
 2383         if (sbp == (mbp = &mp->mnt_stat))
 2384                 return;
 2385 
 2386         sbp->f_fsid = mbp->f_fsid;
 2387         sbp->f_owner = mbp->f_owner;
 2388         sbp->f_flags = mbp->f_flags;
 2389         sbp->f_syncwrites = mbp->f_syncwrites;
 2390         sbp->f_asyncwrites = mbp->f_asyncwrites;
 2391         sbp->f_syncreads = mbp->f_syncreads;
 2392         sbp->f_asyncreads = mbp->f_asyncreads;
 2393         sbp->f_namemax = mbp->f_namemax;
 2394         memcpy(sbp->f_mntonname, mp->mnt_stat.f_mntonname, MNAMELEN);
 2395         memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname, MNAMELEN);
 2396         memcpy(sbp->f_mntfromspec, mp->mnt_stat.f_mntfromspec, MNAMELEN);
 2397         memcpy(&sbp->mount_info, &mp->mnt_stat.mount_info,
 2398             sizeof(union mount_info));
 2399 }
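
/*
 * Illustrative sketch, not from this file: a filesystem's VFS_STATFS
 * implementation fills in its own accounting fields and then calls
 * copy_statfs_info() for the generic, mount-wide ones; "examplefs" is
 * hypothetical.
 */
#if 0
int
examplefs_statfs(struct mount *mp, struct statfs *sbp, struct proc *p)
{
        sbp->f_bsize = 512;     /* placeholder; real accounting elided */
        copy_statfs_info(sbp, mp);
        return (0);
}
#endif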
