The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_subr.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-3-Clause
    3  *
    4  * Copyright (c) 1989, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  * (c) UNIX System Laboratories, Inc.
    7  * All or some portions of this file are derived from material licensed
    8  * to the University of California by American Telephone and Telegraph
    9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   10  * the permission of UNIX System Laboratories, Inc.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
   37  */
   38 
   39 /*
   40  * External virtual filesystem routines
   41  */
   42 
   43 #include <sys/cdefs.h>
   44 __FBSDID("$FreeBSD$");
   45 
   46 #include "opt_ddb.h"
   47 #include "opt_watchdog.h"
   48 
   49 #include <sys/param.h>
   50 #include <sys/systm.h>
   51 #include <sys/asan.h>
   52 #include <sys/bio.h>
   53 #include <sys/buf.h>
   54 #include <sys/capsicum.h>
   55 #include <sys/condvar.h>
   56 #include <sys/conf.h>
   57 #include <sys/counter.h>
   58 #include <sys/dirent.h>
   59 #include <sys/event.h>
   60 #include <sys/eventhandler.h>
   61 #include <sys/extattr.h>
   62 #include <sys/file.h>
   63 #include <sys/fcntl.h>
   64 #include <sys/jail.h>
   65 #include <sys/kdb.h>
   66 #include <sys/kernel.h>
   67 #include <sys/kthread.h>
   68 #include <sys/ktr.h>
   69 #include <sys/lockf.h>
   70 #include <sys/malloc.h>
   71 #include <sys/mount.h>
   72 #include <sys/namei.h>
   73 #include <sys/pctrie.h>
   74 #include <sys/priv.h>
   75 #include <sys/reboot.h>
   76 #include <sys/refcount.h>
   77 #include <sys/rwlock.h>
   78 #include <sys/sched.h>
   79 #include <sys/sleepqueue.h>
   80 #include <sys/smr.h>
   81 #include <sys/smp.h>
   82 #include <sys/stat.h>
   83 #include <sys/sysctl.h>
   84 #include <sys/syslog.h>
   85 #include <sys/vmmeter.h>
   86 #include <sys/vnode.h>
   87 #include <sys/watchdog.h>
   88 
   89 #include <machine/stdarg.h>
   90 
   91 #include <security/mac/mac_framework.h>
   92 
   93 #include <vm/vm.h>
   94 #include <vm/vm_object.h>
   95 #include <vm/vm_extern.h>
   96 #include <vm/pmap.h>
   97 #include <vm/vm_map.h>
   98 #include <vm/vm_page.h>
   99 #include <vm/vm_kern.h>
  100 #include <vm/uma.h>
  101 
  102 #if defined(DEBUG_VFS_LOCKS) && (!defined(INVARIANTS) || !defined(WITNESS))
  103 #error DEBUG_VFS_LOCKS requires INVARIANTS and WITNESS
  104 #endif
  105 
  106 #ifdef DDB
  107 #include <ddb/ddb.h>
  108 #endif
  109 
  110 static void     delmntque(struct vnode *vp);
  111 static int      flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
  112                     int slpflag, int slptimeo);
  113 static void     syncer_shutdown(void *arg, int howto);
  114 static int      vtryrecycle(struct vnode *vp);
  115 static void     v_init_counters(struct vnode *);
  116 static void     vn_seqc_init(struct vnode *);
  117 static void     vn_seqc_write_end_free(struct vnode *vp);
  118 static void     vgonel(struct vnode *);
  119 static bool     vhold_recycle_free(struct vnode *);
  120 static void     vdropl_recycle(struct vnode *vp);
  121 static void     vdrop_recycle(struct vnode *vp);
  122 static void     vfs_knllock(void *arg);
  123 static void     vfs_knlunlock(void *arg);
  124 static void     vfs_knl_assert_lock(void *arg, int what);
  125 static void     destroy_vpollinfo(struct vpollinfo *vi);
  126 static int      v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
  127                     daddr_t startlbn, daddr_t endlbn);
  128 static void     vnlru_recalc(void);
  129 
  130 /*
  131  * Number of vnodes in existence.  Increased whenever getnewvnode()
  132  * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
  133  */
  134 static u_long __exclusive_cache_line numvnodes;
  135 
  136 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
  137     "Number of vnodes in existence");
  138 
  139 static counter_u64_t vnodes_created;
  140 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
  141     "Number of vnodes created by getnewvnode");
  142 
  143 /*
  144  * Conversion tables for conversion from vnode types to inode formats
  145  * and back.
  146  */
  147 enum vtype iftovt_tab[16] = {
  148         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
  149         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
  150 };
  151 int vttoif_tab[10] = {
  152         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
  153         S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
  154 };
  155 
  156 /*
  157  * List of allocates vnodes in the system.
  158  */
  159 static TAILQ_HEAD(freelst, vnode) vnode_list;
  160 static struct vnode *vnode_list_free_marker;
  161 static struct vnode *vnode_list_reclaim_marker;
  162 
  163 /*
  164  * "Free" vnode target.  Free vnodes are rarely completely free, but are
  165  * just ones that are cheap to recycle.  Usually they are for files which
  166  * have been stat'd but not read; these usually have inode and namecache
  167  * data attached to them.  This target is the preferred minimum size of a
  168  * sub-cache consisting mostly of such files. The system balances the size
  169  * of this sub-cache with its complement to try to prevent either from
  170  * thrashing while the other is relatively inactive.  The targets express
  171  * a preference for the best balance.
  172  *
  173  * "Above" this target there are 2 further targets (watermarks) related
  174  * to recyling of free vnodes.  In the best-operating case, the cache is
  175  * exactly full, the free list has size between vlowat and vhiwat above the
  176  * free target, and recycling from it and normal use maintains this state.
  177  * Sometimes the free list is below vlowat or even empty, but this state
  178  * is even better for immediate use provided the cache is not full.
  179  * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
  180  * ones) to reach one of these states.  The watermarks are currently hard-
  181  * coded as 4% and 9% of the available space higher.  These and the default
  182  * of 25% for wantfreevnodes are too large if the memory size is large.
  183  * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
  184  * whenever vnlru_proc() becomes active.
  185  */
  186 static long wantfreevnodes;
  187 static long __exclusive_cache_line freevnodes;
  188 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
  189     &freevnodes, 0, "Number of \"free\" vnodes");
  190 static long freevnodes_old;
  191 
  192 static counter_u64_t recycles_count;
  193 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
  194     "Number of vnodes recycled to meet vnode cache targets");
  195 
  196 static counter_u64_t recycles_free_count;
  197 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count,
  198     "Number of free vnodes recycled to meet vnode cache targets");
  199 
  200 static counter_u64_t deferred_inact;
  201 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact,
  202     "Number of times inactive processing was deferred");
  203 
  204 /* To keep more than one thread at a time from running vfs_getnewfsid */
  205 static struct mtx mntid_mtx;
  206 
  207 /*
  208  * Lock for any access to the following:
  209  *      vnode_list
  210  *      numvnodes
  211  *      freevnodes
  212  */
  213 static struct mtx __exclusive_cache_line vnode_list_mtx;
  214 
  215 /* Publicly exported FS */
  216 struct nfs_public nfs_pub;
  217 
  218 static uma_zone_t buf_trie_zone;
  219 static smr_t buf_trie_smr;
  220 
  221 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
  222 static uma_zone_t vnode_zone;
  223 MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll");
  224 
  225 __read_frequently smr_t vfs_smr;
  226 
  227 /*
  228  * The workitem queue.
  229  *
  230  * It is useful to delay writes of file data and filesystem metadata
  231  * for tens of seconds so that quickly created and deleted files need
  232  * not waste disk bandwidth being created and removed. To realize this,
  233  * we append vnodes to a "workitem" queue. When running with a soft
  234  * updates implementation, most pending metadata dependencies should
  235  * not wait for more than a few seconds. Thus, mounted on block devices
  236  * are delayed only about a half the time that file data is delayed.
  237  * Similarly, directory updates are more critical, so are only delayed
  238  * about a third the time that file data is delayed. Thus, there are
  239  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
  240  * one each second (driven off the filesystem syncer process). The
  241  * syncer_delayno variable indicates the next queue that is to be processed.
  242  * Items that need to be processed soon are placed in this queue:
  243  *
  244  *      syncer_workitem_pending[syncer_delayno]
  245  *
  246  * A delay of fifteen seconds is done by placing the request fifteen
  247  * entries later in the queue:
  248  *
  249  *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
  250  *
  251  */
  252 static int syncer_delayno;
  253 static long syncer_mask;
  254 LIST_HEAD(synclist, bufobj);
  255 static struct synclist *syncer_workitem_pending;
  256 /*
  257  * The sync_mtx protects:
  258  *      bo->bo_synclist
  259  *      sync_vnode_count
  260  *      syncer_delayno
  261  *      syncer_state
  262  *      syncer_workitem_pending
  263  *      syncer_worklist_len
  264  *      rushjob
  265  */
  266 static struct mtx sync_mtx;
  267 static struct cv sync_wakeup;
  268 
  269 #define SYNCER_MAXDELAY         32
  270 static int syncer_maxdelay = SYNCER_MAXDELAY;   /* maximum delay time */
  271 static int syncdelay = 30;              /* max time to delay syncing data */
  272 static int filedelay = 30;              /* time to delay syncing files */
  273 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
  274     "Time to delay syncing files (in seconds)");
  275 static int dirdelay = 29;               /* time to delay syncing directories */
  276 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
  277     "Time to delay syncing directories (in seconds)");
  278 static int metadelay = 28;              /* time to delay syncing metadata */
  279 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
  280     "Time to delay syncing metadata (in seconds)");
  281 static int rushjob;             /* number of slots to run ASAP */
  282 static int stat_rush_requests;  /* number of times I/O speeded up */
  283 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
  284     "Number of times I/O speeded up (rush requests)");
  285 
  286 #define VDBATCH_SIZE 8
  287 struct vdbatch {
  288         u_int index;
  289         long freevnodes;
  290         struct mtx lock;
  291         struct vnode *tab[VDBATCH_SIZE];
  292 };
  293 DPCPU_DEFINE_STATIC(struct vdbatch, vd);
  294 
  295 static void     vdbatch_dequeue(struct vnode *vp);
  296 
  297 /*
  298  * When shutting down the syncer, run it at four times normal speed.
  299  */
  300 #define SYNCER_SHUTDOWN_SPEEDUP         4
  301 static int sync_vnode_count;
  302 static int syncer_worklist_len;
  303 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
  304     syncer_state;
  305 
  306 /* Target for maximum number of vnodes. */
  307 u_long desiredvnodes;
  308 static u_long gapvnodes;                /* gap between wanted and desired */
  309 static u_long vhiwat;           /* enough extras after expansion */
  310 static u_long vlowat;           /* minimal extras before expansion */
  311 static u_long vstir;            /* nonzero to stir non-free vnodes */
  312 static volatile int vsmalltrigger = 8;  /* pref to keep if > this many pages */
  313 
  314 static u_long vnlru_read_freevnodes(void);
  315 
  316 /*
  317  * Note that no attempt is made to sanitize these parameters.
  318  */
  319 static int
  320 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS)
  321 {
  322         u_long val;
  323         int error;
  324 
  325         val = desiredvnodes;
  326         error = sysctl_handle_long(oidp, &val, 0, req);
  327         if (error != 0 || req->newptr == NULL)
  328                 return (error);
  329 
  330         if (val == desiredvnodes)
  331                 return (0);
  332         mtx_lock(&vnode_list_mtx);
  333         desiredvnodes = val;
  334         wantfreevnodes = desiredvnodes / 4;
  335         vnlru_recalc();
  336         mtx_unlock(&vnode_list_mtx);
  337         /*
  338          * XXX There is no protection against multiple threads changing
  339          * desiredvnodes at the same time. Locking above only helps vnlru and
  340          * getnewvnode.
  341          */
  342         vfs_hash_changesize(desiredvnodes);
  343         cache_changesize(desiredvnodes);
  344         return (0);
  345 }
  346 
  347 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
  348     CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
  349     "LU", "Target for maximum number of vnodes");
  350 
  351 static int
  352 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS)
  353 {
  354         u_long val;
  355         int error;
  356 
  357         val = wantfreevnodes;
  358         error = sysctl_handle_long(oidp, &val, 0, req);
  359         if (error != 0 || req->newptr == NULL)
  360                 return (error);
  361 
  362         if (val == wantfreevnodes)
  363                 return (0);
  364         mtx_lock(&vnode_list_mtx);
  365         wantfreevnodes = val;
  366         vnlru_recalc();
  367         mtx_unlock(&vnode_list_mtx);
  368         return (0);
  369 }
  370 
  371 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes,
  372     CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
  373     "LU", "Target for minimum number of \"free\" vnodes");
  374 
  375 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
  376     &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
  377 static int vnlru_nowhere;
  378 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
  379     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
  380 
  381 static int
  382 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
  383 {
  384         struct vnode *vp;
  385         struct nameidata nd;
  386         char *buf;
  387         unsigned long ndflags;
  388         int error;
  389 
  390         if (req->newptr == NULL)
  391                 return (EINVAL);
  392         if (req->newlen >= PATH_MAX)
  393                 return (E2BIG);
  394 
  395         buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
  396         error = SYSCTL_IN(req, buf, req->newlen);
  397         if (error != 0)
  398                 goto out;
  399 
  400         buf[req->newlen] = '\0';
  401 
  402         ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | SAVENAME;
  403         NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread);
  404         if ((error = namei(&nd)) != 0)
  405                 goto out;
  406         vp = nd.ni_vp;
  407 
  408         if (VN_IS_DOOMED(vp)) {
  409                 /*
  410                  * This vnode is being recycled.  Return != 0 to let the caller
  411                  * know that the sysctl had no effect.  Return EAGAIN because a
  412                  * subsequent call will likely succeed (since namei will create
  413                  * a new vnode if necessary)
  414                  */
  415                 error = EAGAIN;
  416                 goto putvnode;
  417         }
  418 
  419         counter_u64_add(recycles_count, 1);
  420         vgone(vp);
  421 putvnode:
  422         NDFREE(&nd, 0);
  423 out:
  424         free(buf, M_TEMP);
  425         return (error);
  426 }
  427 
  428 static int
  429 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
  430 {
  431         struct thread *td = curthread;
  432         struct vnode *vp;
  433         struct file *fp;
  434         int error;
  435         int fd;
  436 
  437         if (req->newptr == NULL)
  438                 return (EBADF);
  439 
  440         error = sysctl_handle_int(oidp, &fd, 0, req);
  441         if (error != 0)
  442                 return (error);
  443         error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
  444         if (error != 0)
  445                 return (error);
  446         vp = fp->f_vnode;
  447 
  448         error = vn_lock(vp, LK_EXCLUSIVE);
  449         if (error != 0)
  450                 goto drop;
  451 
  452         counter_u64_add(recycles_count, 1);
  453         vgone(vp);
  454         VOP_UNLOCK(vp);
  455 drop:
  456         fdrop(fp, td);
  457         return (error);
  458 }
  459 
  460 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
  461     CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
  462     sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
  463 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
  464     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
  465     sysctl_ftry_reclaim_vnode, "I",
  466     "Try to reclaim a vnode by its file descriptor");
  467 
  468 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
  469 static int vnsz2log;
  470 
  471 /*
  472  * Support for the bufobj clean & dirty pctrie.
  473  */
  474 static void *
  475 buf_trie_alloc(struct pctrie *ptree)
  476 {
  477         return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT));
  478 }
  479 
  480 static void
  481 buf_trie_free(struct pctrie *ptree, void *node)
  482 {
  483         uma_zfree_smr(buf_trie_zone, node);
  484 }
  485 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
  486     buf_trie_smr);
  487 
  488 /*
  489  * Initialize the vnode management data structures.
  490  *
  491  * Reevaluate the following cap on the number of vnodes after the physical
  492  * memory size exceeds 512GB.  In the limit, as the physical memory size
  493  * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
  494  */
  495 #ifndef MAXVNODES_MAX
  496 #define MAXVNODES_MAX   (512UL * 1024 * 1024 / 64)      /* 8M */
  497 #endif
  498 
  499 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
  500 
  501 static struct vnode *
  502 vn_alloc_marker(struct mount *mp)
  503 {
  504         struct vnode *vp;
  505 
  506         vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
  507         vp->v_type = VMARKER;
  508         vp->v_mount = mp;
  509 
  510         return (vp);
  511 }
  512 
  513 static void
  514 vn_free_marker(struct vnode *vp)
  515 {
  516 
  517         MPASS(vp->v_type == VMARKER);
  518         free(vp, M_VNODE_MARKER);
  519 }
  520 
  521 #ifdef KASAN
  522 static int
  523 vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused)
  524 {
  525         kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0);
  526         return (0);
  527 }
  528 
  529 static void
  530 vnode_dtor(void *mem, int size, void *arg __unused)
  531 {
  532         size_t end1, end2, off1, off2;
  533 
  534         _Static_assert(offsetof(struct vnode, v_vnodelist) <
  535             offsetof(struct vnode, v_dbatchcpu),
  536             "KASAN marks require updating");
  537 
  538         off1 = offsetof(struct vnode, v_vnodelist);
  539         off2 = offsetof(struct vnode, v_dbatchcpu);
  540         end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist);
  541         end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu);
  542 
  543         /*
  544          * Access to the v_vnodelist and v_dbatchcpu fields are permitted even
  545          * after the vnode has been freed.  Try to get some KASAN coverage by
  546          * marking everything except those two fields as invalid.  Because
  547          * KASAN's tracking is not byte-granular, any preceding fields sharing
  548          * the same 8-byte aligned word must also be marked valid.
  549          */
  550 
  551         /* Handle the area from the start until v_vnodelist... */
  552         off1 = rounddown2(off1, KASAN_SHADOW_SCALE);
  553         kasan_mark(mem, off1, off1, KASAN_UMA_FREED);
  554 
  555         /* ... then the area between v_vnodelist and v_dbatchcpu ... */
  556         off1 = roundup2(end1, KASAN_SHADOW_SCALE);
  557         off2 = rounddown2(off2, KASAN_SHADOW_SCALE);
  558         if (off2 > off1)
  559                 kasan_mark((void *)((char *)mem + off1), off2 - off1,
  560                     off2 - off1, KASAN_UMA_FREED);
  561 
  562         /* ... and finally the area from v_dbatchcpu to the end. */
  563         off2 = roundup2(end2, KASAN_SHADOW_SCALE);
  564         kasan_mark((void *)((char *)mem + off2), size - off2, size - off2,
  565             KASAN_UMA_FREED);
  566 }
  567 #endif /* KASAN */
  568 
  569 /*
  570  * Initialize a vnode as it first enters the zone.
  571  */
  572 static int
  573 vnode_init(void *mem, int size, int flags)
  574 {
  575         struct vnode *vp;
  576 
  577         vp = mem;
  578         bzero(vp, size);
  579         /*
  580          * Setup locks.
  581          */
  582         vp->v_vnlock = &vp->v_lock;
  583         mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
  584         /*
  585          * By default, don't allow shared locks unless filesystems opt-in.
  586          */
  587         lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
  588             LK_NOSHARE | LK_IS_VNODE);
  589         /*
  590          * Initialize bufobj.
  591          */
  592         bufobj_init(&vp->v_bufobj, vp);
  593         /*
  594          * Initialize namecache.
  595          */
  596         cache_vnode_init(vp);
  597         /*
  598          * Initialize rangelocks.
  599          */
  600         rangelock_init(&vp->v_rl);
  601 
  602         vp->v_dbatchcpu = NOCPU;
  603 
  604         /*
  605          * Check vhold_recycle_free for an explanation.
  606          */
  607         vp->v_holdcnt = VHOLD_NO_SMR;
  608         vp->v_type = VNON;
  609         mtx_lock(&vnode_list_mtx);
  610         TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
  611         mtx_unlock(&vnode_list_mtx);
  612         return (0);
  613 }
  614 
  615 /*
  616  * Free a vnode when it is cleared from the zone.
  617  */
  618 static void
  619 vnode_fini(void *mem, int size)
  620 {
  621         struct vnode *vp;
  622         struct bufobj *bo;
  623 
  624         vp = mem;
  625         vdbatch_dequeue(vp);
  626         mtx_lock(&vnode_list_mtx);
  627         TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
  628         mtx_unlock(&vnode_list_mtx);
  629         rangelock_destroy(&vp->v_rl);
  630         lockdestroy(vp->v_vnlock);
  631         mtx_destroy(&vp->v_interlock);
  632         bo = &vp->v_bufobj;
  633         rw_destroy(BO_LOCKPTR(bo));
  634 
  635         kasan_mark(mem, size, size, 0);
  636 }
  637 
  638 /*
  639  * Provide the size of NFS nclnode and NFS fh for calculation of the
  640  * vnode memory consumption.  The size is specified directly to
  641  * eliminate dependency on NFS-private header.
  642  *
  643  * Other filesystems may use bigger or smaller (like UFS and ZFS)
  644  * private inode data, but the NFS-based estimation is ample enough.
  645  * Still, we care about differences in the size between 64- and 32-bit
  646  * platforms.
  647  *
  648  * Namecache structure size is heuristically
  649  * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
  650  */
  651 #ifdef _LP64
  652 #define NFS_NCLNODE_SZ  (528 + 64)
  653 #define NC_SZ           148
  654 #else
  655 #define NFS_NCLNODE_SZ  (360 + 32)
  656 #define NC_SZ           92
  657 #endif
  658 
  659 static void
  660 vntblinit(void *dummy __unused)
  661 {
  662         struct vdbatch *vd;
  663         uma_ctor ctor;
  664         uma_dtor dtor;
  665         int cpu, physvnodes, virtvnodes;
  666         u_int i;
  667 
  668         /*
  669          * Desiredvnodes is a function of the physical memory size and the
  670          * kernel's heap size.  Generally speaking, it scales with the
  671          * physical memory size.  The ratio of desiredvnodes to the physical
  672          * memory size is 1:16 until desiredvnodes exceeds 98,304.
  673          * Thereafter, the
  674          * marginal ratio of desiredvnodes to the physical memory size is
  675          * 1:64.  However, desiredvnodes is limited by the kernel's heap
  676          * size.  The memory required by desiredvnodes vnodes and vm objects
  677          * must not exceed 1/10th of the kernel's heap size.
  678          */
  679         physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
  680             3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
  681         virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
  682             sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
  683         desiredvnodes = min(physvnodes, virtvnodes);
  684         if (desiredvnodes > MAXVNODES_MAX) {
  685                 if (bootverbose)
  686                         printf("Reducing kern.maxvnodes %lu -> %lu\n",
  687                             desiredvnodes, MAXVNODES_MAX);
  688                 desiredvnodes = MAXVNODES_MAX;
  689         }
  690         wantfreevnodes = desiredvnodes / 4;
  691         mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
  692         TAILQ_INIT(&vnode_list);
  693         mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
  694         /*
  695          * The lock is taken to appease WITNESS.
  696          */
  697         mtx_lock(&vnode_list_mtx);
  698         vnlru_recalc();
  699         mtx_unlock(&vnode_list_mtx);
  700         vnode_list_free_marker = vn_alloc_marker(NULL);
  701         TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist);
  702         vnode_list_reclaim_marker = vn_alloc_marker(NULL);
  703         TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
  704 
  705 #ifdef KASAN
  706         ctor = vnode_ctor;
  707         dtor = vnode_dtor;
  708 #else
  709         ctor = NULL;
  710         dtor = NULL;
  711 #endif
  712         vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor,
  713             vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN);
  714         uma_zone_set_smr(vnode_zone, vfs_smr);
  715 
  716         /*
  717          * Preallocate enough nodes to support one-per buf so that
  718          * we can not fail an insert.  reassignbuf() callers can not
  719          * tolerate the insertion failure.
  720          */
  721         buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
  722             NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 
  723             UMA_ZONE_NOFREE | UMA_ZONE_SMR);
  724         buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
  725         uma_prealloc(buf_trie_zone, nbuf);
  726 
  727         vnodes_created = counter_u64_alloc(M_WAITOK);
  728         recycles_count = counter_u64_alloc(M_WAITOK);
  729         recycles_free_count = counter_u64_alloc(M_WAITOK);
  730         deferred_inact = counter_u64_alloc(M_WAITOK);
  731 
  732         /*
  733          * Initialize the filesystem syncer.
  734          */
  735         syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
  736             &syncer_mask);
  737         syncer_maxdelay = syncer_mask + 1;
  738         mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
  739         cv_init(&sync_wakeup, "syncer");
  740         for (i = 1; i <= sizeof(struct vnode); i <<= 1)
  741                 vnsz2log++;
  742         vnsz2log--;
  743 
  744         CPU_FOREACH(cpu) {
  745                 vd = DPCPU_ID_PTR((cpu), vd);
  746                 bzero(vd, sizeof(*vd));
  747                 mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
  748         }
  749 }
  750 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
  751 
  752 /*
  753  * Mark a mount point as busy. Used to synchronize access and to delay
  754  * unmounting. Eventually, mountlist_mtx is not released on failure.
  755  *
  756  * vfs_busy() is a custom lock, it can block the caller.
  757  * vfs_busy() only sleeps if the unmount is active on the mount point.
  758  * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
  759  * vnode belonging to mp.
  760  *
  761  * Lookup uses vfs_busy() to traverse mount points.
  762  * root fs                      var fs
  763  * / vnode lock         A       / vnode lock (/var)             D
  764  * /var vnode lock      B       /log vnode lock(/var/log)       E
  765  * vfs_busy lock        C       vfs_busy lock                   F
  766  *
  767  * Within each file system, the lock order is C->A->B and F->D->E.
  768  *
  769  * When traversing across mounts, the system follows that lock order:
  770  *
  771  *        C->A->B
  772  *              |
  773  *              +->F->D->E
  774  *
  775  * The lookup() process for namei("/var") illustrates the process:
  776  *  VOP_LOOKUP() obtains B while A is held
  777  *  vfs_busy() obtains a shared lock on F while A and B are held
  778  *  vput() releases lock on B
  779  *  vput() releases lock on A
  780  *  VFS_ROOT() obtains lock on D while shared lock on F is held
  781  *  vfs_unbusy() releases shared lock on F
  782  *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
  783  *    Attempt to lock A (instead of vp_crossmp) while D is held would
  784  *    violate the global order, causing deadlocks.
  785  *
  786  * dounmount() locks B while F is drained.
  787  */
  788 int
  789 vfs_busy(struct mount *mp, int flags)
  790 {
  791         struct mount_pcpu *mpcpu;
  792 
  793         MPASS((flags & ~MBF_MASK) == 0);
  794         CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
  795 
  796         if (vfs_op_thread_enter(mp, mpcpu)) {
  797                 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
  798                 MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
  799                 MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
  800                 vfs_mp_count_add_pcpu(mpcpu, ref, 1);
  801                 vfs_mp_count_add_pcpu(mpcpu, lockref, 1);
  802                 vfs_op_thread_exit(mp, mpcpu);
  803                 if (flags & MBF_MNTLSTLOCK)
  804                         mtx_unlock(&mountlist_mtx);
  805                 return (0);
  806         }
  807 
  808         MNT_ILOCK(mp);
  809         vfs_assert_mount_counters(mp);
  810         MNT_REF(mp);
  811         /*
  812          * If mount point is currently being unmounted, sleep until the
  813          * mount point fate is decided.  If thread doing the unmounting fails,
  814          * it will clear MNTK_UNMOUNT flag before waking us up, indicating
  815          * that this mount point has survived the unmount attempt and vfs_busy
  816          * should retry.  Otherwise the unmounter thread will set MNTK_REFEXPIRE
  817          * flag in addition to MNTK_UNMOUNT, indicating that mount point is
  818          * about to be really destroyed.  vfs_busy needs to release its
  819          * reference on the mount point in this case and return with ENOENT,
  820          * telling the caller that mount mount it tried to busy is no longer
  821          * valid.
  822          */
  823         while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
  824                 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
  825                         MNT_REL(mp);
  826                         MNT_IUNLOCK(mp);
  827                         CTR1(KTR_VFS, "%s: failed busying before sleeping",
  828                             __func__);
  829                         return (ENOENT);
  830                 }
  831                 if (flags & MBF_MNTLSTLOCK)
  832                         mtx_unlock(&mountlist_mtx);
  833                 mp->mnt_kern_flag |= MNTK_MWAIT;
  834                 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
  835                 if (flags & MBF_MNTLSTLOCK)
  836                         mtx_lock(&mountlist_mtx);
  837                 MNT_ILOCK(mp);
  838         }
  839         if (flags & MBF_MNTLSTLOCK)
  840                 mtx_unlock(&mountlist_mtx);
  841         mp->mnt_lockref++;
  842         MNT_IUNLOCK(mp);
  843         return (0);
  844 }
  845 
  846 /*
  847  * Free a busy filesystem.
  848  */
  849 void
  850 vfs_unbusy(struct mount *mp)
  851 {
  852         struct mount_pcpu *mpcpu;
  853         int c;
  854 
  855         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
  856 
  857         if (vfs_op_thread_enter(mp, mpcpu)) {
  858                 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
  859                 vfs_mp_count_sub_pcpu(mpcpu, lockref, 1);
  860                 vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
  861                 vfs_op_thread_exit(mp, mpcpu);
  862                 return;
  863         }
  864 
  865         MNT_ILOCK(mp);
  866         vfs_assert_mount_counters(mp);
  867         MNT_REL(mp);
  868         c = --mp->mnt_lockref;
  869         if (mp->mnt_vfs_ops == 0) {
  870                 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
  871                 MNT_IUNLOCK(mp);
  872                 return;
  873         }
  874         if (c < 0)
  875                 vfs_dump_mount_counters(mp);
  876         if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
  877                 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
  878                 CTR1(KTR_VFS, "%s: waking up waiters", __func__);
  879                 mp->mnt_kern_flag &= ~MNTK_DRAINING;
  880                 wakeup(&mp->mnt_lockref);
  881         }
  882         MNT_IUNLOCK(mp);
  883 }
  884 
  885 /*
  886  * Lookup a mount point by filesystem identifier.
  887  */
  888 struct mount *
  889 vfs_getvfs(fsid_t *fsid)
  890 {
  891         struct mount *mp;
  892 
  893         CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
  894         mtx_lock(&mountlist_mtx);
  895         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
  896                 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
  897                         vfs_ref(mp);
  898                         mtx_unlock(&mountlist_mtx);
  899                         return (mp);
  900                 }
  901         }
  902         mtx_unlock(&mountlist_mtx);
  903         CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
  904         return ((struct mount *) 0);
  905 }
  906 
  907 /*
  908  * Lookup a mount point by filesystem identifier, busying it before
  909  * returning.
  910  *
  911  * To avoid congestion on mountlist_mtx, implement simple direct-mapped
  912  * cache for popular filesystem identifiers.  The cache is lockess, using
  913  * the fact that struct mount's are never freed.  In worst case we may
  914  * get pointer to unmounted or even different filesystem, so we have to
  915  * check what we got, and go slow way if so.
  916  */
  917 struct mount *
  918 vfs_busyfs(fsid_t *fsid)
  919 {
  920 #define FSID_CACHE_SIZE 256
  921         typedef struct mount * volatile vmp_t;
  922         static vmp_t cache[FSID_CACHE_SIZE];
  923         struct mount *mp;
  924         int error;
  925         uint32_t hash;
  926 
  927         CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
  928         hash = fsid->val[0] ^ fsid->val[1];
  929         hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
  930         mp = cache[hash];
  931         if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0)
  932                 goto slow;
  933         if (vfs_busy(mp, 0) != 0) {
  934                 cache[hash] = NULL;
  935                 goto slow;
  936         }
  937         if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0)
  938                 return (mp);
  939         else
  940             vfs_unbusy(mp);
  941 
  942 slow:
  943         mtx_lock(&mountlist_mtx);
  944         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
  945                 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
  946                         error = vfs_busy(mp, MBF_MNTLSTLOCK);
  947                         if (error) {
  948                                 cache[hash] = NULL;
  949                                 mtx_unlock(&mountlist_mtx);
  950                                 return (NULL);
  951                         }
  952                         cache[hash] = mp;
  953                         return (mp);
  954                 }
  955         }
  956         CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
  957         mtx_unlock(&mountlist_mtx);
  958         return ((struct mount *) 0);
  959 }
  960 
  961 /*
  962  * Check if a user can access privileged mount options.
  963  */
  964 int
  965 vfs_suser(struct mount *mp, struct thread *td)
  966 {
  967         int error;
  968 
  969         if (jailed(td->td_ucred)) {
  970                 /*
  971                  * If the jail of the calling thread lacks permission for
  972                  * this type of file system, deny immediately.
  973                  */
  974                 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
  975                         return (EPERM);
  976 
  977                 /*
  978                  * If the file system was mounted outside the jail of the
  979                  * calling thread, deny immediately.
  980                  */
  981                 if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
  982                         return (EPERM);
  983         }
  984 
  985         /*
  986          * If file system supports delegated administration, we don't check
  987          * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
  988          * by the file system itself.
  989          * If this is not the user that did original mount, we check for
  990          * the PRIV_VFS_MOUNT_OWNER privilege.
  991          */
  992         if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
  993             mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
  994                 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
  995                         return (error);
  996         }
  997         return (0);
  998 }
  999 
 1000 /*
 1001  * Get a new unique fsid.  Try to make its val[0] unique, since this value
 1002  * will be used to create fake device numbers for stat().  Also try (but
 1003  * not so hard) make its val[0] unique mod 2^16, since some emulators only
 1004  * support 16-bit device numbers.  We end up with unique val[0]'s for the
 1005  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 1006  *
 1007  * Keep in mind that several mounts may be running in parallel.  Starting
 1008  * the search one past where the previous search terminated is both a
 1009  * micro-optimization and a defense against returning the same fsid to
 1010  * different mounts.
 1011  */
 1012 void
 1013 vfs_getnewfsid(struct mount *mp)
 1014 {
 1015         static uint16_t mntid_base;
 1016         struct mount *nmp;
 1017         fsid_t tfsid;
 1018         int mtype;
 1019 
 1020         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 1021         mtx_lock(&mntid_mtx);
 1022         mtype = mp->mnt_vfc->vfc_typenum;
 1023         tfsid.val[1] = mtype;
 1024         mtype = (mtype & 0xFF) << 24;
 1025         for (;;) {
 1026                 tfsid.val[0] = makedev(255,
 1027                     mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 1028                 mntid_base++;
 1029                 if ((nmp = vfs_getvfs(&tfsid)) == NULL)
 1030                         break;
 1031                 vfs_rel(nmp);
 1032         }
 1033         mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
 1034         mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
 1035         mtx_unlock(&mntid_mtx);
 1036 }
 1037 
 1038 /*
 1039  * Knob to control the precision of file timestamps:
 1040  *
 1041  *   0 = seconds only; nanoseconds zeroed.
 1042  *   1 = seconds and nanoseconds, accurate within 1/HZ.
 1043  *   2 = seconds and nanoseconds, truncated to microseconds.
 1044  * >=3 = seconds and nanoseconds, maximum precision.
 1045  */
 1046 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 1047 
 1048 static int timestamp_precision = TSP_USEC;
 1049 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
 1050     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
 1051     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
 1052     "3+: sec + ns (max. precision))");
 1053 
 1054 /*
 1055  * Get a current timestamp.
 1056  */
 1057 void
 1058 vfs_timestamp(struct timespec *tsp)
 1059 {
 1060         struct timeval tv;
 1061 
 1062         switch (timestamp_precision) {
 1063         case TSP_SEC:
 1064                 tsp->tv_sec = time_second;
 1065                 tsp->tv_nsec = 0;
 1066                 break;
 1067         case TSP_HZ:
 1068                 getnanotime(tsp);
 1069                 break;
 1070         case TSP_USEC:
 1071                 microtime(&tv);
 1072                 TIMEVAL_TO_TIMESPEC(&tv, tsp);
 1073                 break;
 1074         case TSP_NSEC:
 1075         default:
 1076                 nanotime(tsp);
 1077                 break;
 1078         }
 1079 }
 1080 
 1081 /*
 1082  * Set vnode attributes to VNOVAL
 1083  */
 1084 void
 1085 vattr_null(struct vattr *vap)
 1086 {
 1087 
 1088         vap->va_type = VNON;
 1089         vap->va_size = VNOVAL;
 1090         vap->va_bytes = VNOVAL;
 1091         vap->va_mode = VNOVAL;
 1092         vap->va_nlink = VNOVAL;
 1093         vap->va_uid = VNOVAL;
 1094         vap->va_gid = VNOVAL;
 1095         vap->va_fsid = VNOVAL;
 1096         vap->va_fileid = VNOVAL;
 1097         vap->va_blocksize = VNOVAL;
 1098         vap->va_rdev = VNOVAL;
 1099         vap->va_atime.tv_sec = VNOVAL;
 1100         vap->va_atime.tv_nsec = VNOVAL;
 1101         vap->va_mtime.tv_sec = VNOVAL;
 1102         vap->va_mtime.tv_nsec = VNOVAL;
 1103         vap->va_ctime.tv_sec = VNOVAL;
 1104         vap->va_ctime.tv_nsec = VNOVAL;
 1105         vap->va_birthtime.tv_sec = VNOVAL;
 1106         vap->va_birthtime.tv_nsec = VNOVAL;
 1107         vap->va_flags = VNOVAL;
 1108         vap->va_gen = VNOVAL;
 1109         vap->va_vaflags = 0;
 1110 }
 1111 
 1112 /*
 1113  * Try to reduce the total number of vnodes.
 1114  *
 1115  * This routine (and its user) are buggy in at least the following ways:
 1116  * - all parameters were picked years ago when RAM sizes were significantly
 1117  *   smaller
 1118  * - it can pick vnodes based on pages used by the vm object, but filesystems
 1119  *   like ZFS don't use it making the pick broken
 1120  * - since ZFS has its own aging policy it gets partially combated by this one
 1121  * - a dedicated method should be provided for filesystems to let them decide
 1122  *   whether the vnode should be recycled
 1123  *
 1124  * This routine is called when we have too many vnodes.  It attempts
 1125  * to free <count> vnodes and will potentially free vnodes that still
 1126  * have VM backing store (VM backing store is typically the cause
 1127  * of a vnode blowout so we want to do this).  Therefore, this operation
 1128  * is not considered cheap.
 1129  *
 1130  * A number of conditions may prevent a vnode from being reclaimed.
 1131  * the buffer cache may have references on the vnode, a directory
 1132  * vnode may still have references due to the namei cache representing
 1133  * underlying files, or the vnode may be in active use.   It is not
 1134  * desirable to reuse such vnodes.  These conditions may cause the
 1135  * number of vnodes to reach some minimum value regardless of what
 1136  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 1137  *
 1138  * @param reclaim_nc_src Only reclaim directories with outgoing namecache
 1139  *                       entries if this argument is strue
 1140  * @param trigger        Only reclaim vnodes with fewer than this many resident
 1141  *                       pages.
 1142  * @param target         How many vnodes to reclaim.
 1143  * @return               The number of vnodes that were reclaimed.
 1144  */
 1145 static int
 1146 vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
 1147 {
 1148         struct vnode *vp, *mvp;
 1149         struct mount *mp;
 1150         struct vm_object *object;
 1151         u_long done;
 1152         bool retried;
 1153 
 1154         mtx_assert(&vnode_list_mtx, MA_OWNED);
 1155 
 1156         retried = false;
 1157         done = 0;
 1158 
 1159         mvp = vnode_list_reclaim_marker;
 1160 restart:
 1161         vp = mvp;
 1162         while (done < target) {
 1163                 vp = TAILQ_NEXT(vp, v_vnodelist);
 1164                 if (__predict_false(vp == NULL))
 1165                         break;
 1166 
 1167                 if (__predict_false(vp->v_type == VMARKER))
 1168                         continue;
 1169 
 1170                 /*
 1171                  * If it's been deconstructed already, it's still
 1172                  * referenced, or it exceeds the trigger, skip it.
 1173                  * Also skip free vnodes.  We are trying to make space
 1174                  * to expand the free list, not reduce it.
 1175                  */
 1176                 if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
 1177                     (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
 1178                         goto next_iter;
 1179 
 1180                 if (vp->v_type == VBAD || vp->v_type == VNON)
 1181                         goto next_iter;
 1182 
 1183                 object = atomic_load_ptr(&vp->v_object);
 1184                 if (object == NULL || object->resident_page_count > trigger) {
 1185                         goto next_iter;
 1186                 }
 1187 
 1188                 /*
 1189                  * Handle races against vnode allocation. Filesystems lock the
 1190                  * vnode some time after it gets returned from getnewvnode,
 1191                  * despite type and hold count being manipulated earlier.
 1192                  * Resorting to checking v_mount restores guarantees present
 1193                  * before the global list was reworked to contain all vnodes.
 1194                  */
 1195                 if (!VI_TRYLOCK(vp))
 1196                         goto next_iter;
 1197                 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
 1198                         VI_UNLOCK(vp);
 1199                         goto next_iter;
 1200                 }
 1201                 if (vp->v_mount == NULL) {
 1202                         VI_UNLOCK(vp);
 1203                         goto next_iter;
 1204                 }
 1205                 vholdl(vp);
 1206                 VI_UNLOCK(vp);
 1207                 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 1208                 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
 1209                 mtx_unlock(&vnode_list_mtx);
 1210 
 1211                 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 1212                         vdrop_recycle(vp);
 1213                         goto next_iter_unlocked;
 1214                 }
 1215                 if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) {
 1216                         vdrop_recycle(vp);
 1217                         vn_finished_write(mp);
 1218                         goto next_iter_unlocked;
 1219                 }
 1220 
 1221                 VI_LOCK(vp);
 1222                 if (vp->v_usecount > 0 ||
 1223                     (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
 1224                     (vp->v_object != NULL && vp->v_object->handle == vp &&
 1225                     vp->v_object->resident_page_count > trigger)) {
 1226                         VOP_UNLOCK(vp);
 1227                         vdropl_recycle(vp);
 1228                         vn_finished_write(mp);
 1229                         goto next_iter_unlocked;
 1230                 }
 1231                 counter_u64_add(recycles_count, 1);
 1232                 vgonel(vp);
 1233                 VOP_UNLOCK(vp);
 1234                 vdropl_recycle(vp);
 1235                 vn_finished_write(mp);
 1236                 done++;
 1237 next_iter_unlocked:
 1238                 if (should_yield())
 1239                         kern_yield(PRI_USER);
 1240                 mtx_lock(&vnode_list_mtx);
 1241                 goto restart;
 1242 next_iter:
 1243                 MPASS(vp->v_type != VMARKER);
 1244                 if (!should_yield())
 1245                         continue;
 1246                 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 1247                 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
 1248                 mtx_unlock(&vnode_list_mtx);
 1249                 kern_yield(PRI_USER);
 1250                 mtx_lock(&vnode_list_mtx);
 1251                 goto restart;
 1252         }
 1253         if (done == 0 && !retried) {
 1254                 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 1255                 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
 1256                 retried = true;
 1257                 goto restart;
 1258         }
 1259         return (done);
 1260 }
 1261 
 1262 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
 1263 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
 1264     0,
 1265     "limit on vnode free requests per call to the vnlru_free routine");
 1266 
 1267 /*
 1268  * Attempt to reduce the free list by the requested amount.
 1269  */
 1270 static int
 1271 vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp)
 1272 {
 1273         struct vnode *vp;
 1274         struct mount *mp;
 1275         int ocount;
 1276 
 1277         mtx_assert(&vnode_list_mtx, MA_OWNED);
 1278         if (count > max_vnlru_free)
 1279                 count = max_vnlru_free;
 1280         ocount = count;
 1281         vp = mvp;
 1282         for (;;) {
 1283                 if (count == 0) {
 1284                         break;
 1285                 }
 1286                 vp = TAILQ_NEXT(vp, v_vnodelist);
 1287                 if (__predict_false(vp == NULL)) {
 1288                         TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 1289                         TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
 1290                         break;
 1291                 }
 1292                 if (__predict_false(vp->v_type == VMARKER))
 1293                         continue;
 1294                 if (vp->v_holdcnt > 0)
 1295                         continue;
 1296                 /*
 1297                  * Don't recycle if our vnode is from different type
 1298                  * of mount point.  Note that mp is type-safe, the
 1299                  * check does not reach unmapped address even if
 1300                  * vnode is reclaimed.
 1301                  */
 1302                 if (mnt_op != NULL && (mp = vp->v_mount) != NULL &&
 1303                     mp->mnt_op != mnt_op) {
 1304                         continue;
 1305                 }
 1306                 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
 1307                         continue;
 1308                 }
 1309                 if (!vhold_recycle_free(vp))
 1310                         continue;
 1311                 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 1312                 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
 1313                 mtx_unlock(&vnode_list_mtx);
 1314                 /*
 1315                  * FIXME: ignores the return value, meaning it may be nothing
 1316                  * got recycled but it claims otherwise to the caller.
 1317                  *
 1318                  * Originally the value started being ignored in 2005 with
 1319                  * 114a1006a8204aa156e1f9ad6476cdff89cada7f .
 1320                  *
 1321                  * Respecting the value can run into significant stalls if most
 1322                  * vnodes belong to one file system and it has writes
 1323                  * suspended.  In presence of many threads and millions of
 1324                  * vnodes they keep contending on the vnode_list_mtx lock only
 1325                  * to find vnodes they can't recycle.
 1326                  *
 1327                  * The solution would be to pre-check if the vnode is likely to
 1328                  * be recycle-able, but it needs to happen with the
 1329                  * vnode_list_mtx lock held. This runs into a problem where
 1330                  * VOP_GETWRITEMOUNT (currently needed to find out about if
 1331                  * writes are frozen) can take locks which LOR against it.
 1332                  *
 1333                  * Check nullfs for one example (null_getwritemount).
 1334                  */
 1335                 vtryrecycle(vp);
 1336                 count--;
 1337                 mtx_lock(&vnode_list_mtx);
 1338                 vp = mvp;
 1339         }
 1340         return (ocount - count);
 1341 }
 1342 
 1343 static int
 1344 vnlru_free_locked(int count)
 1345 {
 1346 
 1347         mtx_assert(&vnode_list_mtx, MA_OWNED);
 1348         return (vnlru_free_impl(count, NULL, vnode_list_free_marker));
 1349 }
 1350 
 1351 void
 1352 vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp)
 1353 {
 1354 
 1355         MPASS(mnt_op != NULL);
 1356         MPASS(mvp != NULL);
 1357         VNPASS(mvp->v_type == VMARKER, mvp);
 1358         mtx_lock(&vnode_list_mtx);
 1359         vnlru_free_impl(count, mnt_op, mvp);
 1360         mtx_unlock(&vnode_list_mtx);
 1361 }
 1362 
 1363 /*
 1364  * Temporary binary compat, don't use. Call vnlru_free_vfsops instead.
 1365  */
 1366 void
 1367 vnlru_free(int count, struct vfsops *mnt_op)
 1368 {
 1369         struct vnode *mvp;
 1370 
 1371         if (count == 0)
 1372                 return;
 1373         mtx_lock(&vnode_list_mtx);
 1374         mvp = vnode_list_free_marker;
 1375         if (vnlru_free_impl(count, mnt_op, mvp) == 0) {
 1376                 /*
 1377                  * It is possible the marker was moved over eligible vnodes by
 1378                  * callers which filtered by different ops. If so, start from
 1379                  * scratch.
 1380                  */
 1381                 if (vnlru_read_freevnodes() > 0) {
 1382                         TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 1383                         TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
 1384                 }
 1385                 vnlru_free_impl(count, mnt_op, mvp);
 1386         }
 1387         mtx_unlock(&vnode_list_mtx);
 1388 }
 1389 
 1390 struct vnode *
 1391 vnlru_alloc_marker(void)
 1392 {
 1393         struct vnode *mvp;
 1394 
 1395         mvp = vn_alloc_marker(NULL);
 1396         mtx_lock(&vnode_list_mtx);
 1397         TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist);
 1398         mtx_unlock(&vnode_list_mtx);
 1399         return (mvp);
 1400 }
 1401 
 1402 void
 1403 vnlru_free_marker(struct vnode *mvp)
 1404 {
 1405         mtx_lock(&vnode_list_mtx);
 1406         TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 1407         mtx_unlock(&vnode_list_mtx);
 1408         vn_free_marker(mvp);
 1409 }
 1410 
 1411 static void
 1412 vnlru_recalc(void)
 1413 {
 1414 
 1415         mtx_assert(&vnode_list_mtx, MA_OWNED);
 1416         gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
 1417         vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
 1418         vlowat = vhiwat / 2;
 1419 }
 1420 
 1421 /*
 1422  * Attempt to recycle vnodes in a context that is always safe to block.
 1423  * Calling vlrurecycle() from the bowels of filesystem code has some
 1424  * interesting deadlock problems.
 1425  */
 1426 static struct proc *vnlruproc;
 1427 static int vnlruproc_sig;
 1428 
 1429 /*
 1430  * The main freevnodes counter is only updated when threads requeue their vnode
 1431  * batches. CPUs are conditionally walked to compute a more accurate total.
 1432  *
 1433  * Limit how much of a slop are we willing to tolerate. Note: the actual value
 1434  * at any given moment can still exceed slop, but it should not be by significant
 1435  * margin in practice.
 1436  */
 1437 #define VNLRU_FREEVNODES_SLOP 128
 1438 
 1439 static __inline void
 1440 vfs_freevnodes_inc(void)
 1441 {
 1442         struct vdbatch *vd;
 1443 
 1444         critical_enter();
 1445         vd = DPCPU_PTR(vd);
 1446         vd->freevnodes++;
 1447         critical_exit();
 1448 }
 1449 
 1450 static __inline void
 1451 vfs_freevnodes_dec(void)
 1452 {
 1453         struct vdbatch *vd;
 1454 
 1455         critical_enter();
 1456         vd = DPCPU_PTR(vd);
 1457         vd->freevnodes--;
 1458         critical_exit();
 1459 }
 1460 
 1461 static u_long
 1462 vnlru_read_freevnodes(void)
 1463 {
 1464         struct vdbatch *vd;
 1465         long slop;
 1466         int cpu;
 1467 
 1468         mtx_assert(&vnode_list_mtx, MA_OWNED);
 1469         if (freevnodes > freevnodes_old)
 1470                 slop = freevnodes - freevnodes_old;
 1471         else
 1472                 slop = freevnodes_old - freevnodes;
 1473         if (slop < VNLRU_FREEVNODES_SLOP)
 1474                 return (freevnodes >= 0 ? freevnodes : 0);
 1475         freevnodes_old = freevnodes;
 1476         CPU_FOREACH(cpu) {
 1477                 vd = DPCPU_ID_PTR((cpu), vd);
 1478                 freevnodes_old += vd->freevnodes;
 1479         }
 1480         return (freevnodes_old >= 0 ? freevnodes_old : 0);
 1481 }
 1482 
 1483 static bool
 1484 vnlru_under(u_long rnumvnodes, u_long limit)
 1485 {
 1486         u_long rfreevnodes, space;
 1487 
 1488         if (__predict_false(rnumvnodes > desiredvnodes))
 1489                 return (true);
 1490 
 1491         space = desiredvnodes - rnumvnodes;
 1492         if (space < limit) {
 1493                 rfreevnodes = vnlru_read_freevnodes();
 1494                 if (rfreevnodes > wantfreevnodes)
 1495                         space += rfreevnodes - wantfreevnodes;
 1496         }
 1497         return (space < limit);
 1498 }
 1499 
 1500 static bool
 1501 vnlru_under_unlocked(u_long rnumvnodes, u_long limit)
 1502 {
 1503         long rfreevnodes, space;
 1504 
 1505         if (__predict_false(rnumvnodes > desiredvnodes))
 1506                 return (true);
 1507 
 1508         space = desiredvnodes - rnumvnodes;
 1509         if (space < limit) {
 1510                 rfreevnodes = atomic_load_long(&freevnodes);
 1511                 if (rfreevnodes > wantfreevnodes)
 1512                         space += rfreevnodes - wantfreevnodes;
 1513         }
 1514         return (space < limit);
 1515 }
 1516 
 1517 static void
 1518 vnlru_kick(void)
 1519 {
 1520 
 1521         mtx_assert(&vnode_list_mtx, MA_OWNED);
 1522         if (vnlruproc_sig == 0) {
 1523                 vnlruproc_sig = 1;
 1524                 wakeup(vnlruproc);
 1525         }
 1526 }
 1527 
 1528 static void
 1529 vnlru_proc(void)
 1530 {
 1531         u_long rnumvnodes, rfreevnodes, target;
 1532         unsigned long onumvnodes;
 1533         int done, force, trigger, usevnodes;
 1534         bool reclaim_nc_src, want_reread;
 1535 
 1536         EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
 1537             SHUTDOWN_PRI_FIRST);
 1538 
 1539         force = 0;
 1540         want_reread = false;
 1541         for (;;) {
 1542                 kproc_suspend_check(vnlruproc);
 1543                 mtx_lock(&vnode_list_mtx);
 1544                 rnumvnodes = atomic_load_long(&numvnodes);
 1545 
 1546                 if (want_reread) {
 1547                         force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
 1548                         want_reread = false;
 1549                 }
 1550 
 1551                 /*
 1552                  * If numvnodes is too large (due to desiredvnodes being
 1553                  * adjusted using its sysctl, or emergency growth), first
 1554                  * try to reduce it by discarding from the free list.
 1555                  */
 1556                 if (rnumvnodes > desiredvnodes) {
 1557                         vnlru_free_locked(rnumvnodes - desiredvnodes);
 1558                         rnumvnodes = atomic_load_long(&numvnodes);
 1559                 }
 1560                 /*
 1561                  * Sleep if the vnode cache is in a good state.  This is
 1562                  * when it is not over-full and has space for about a 4%
 1563                  * or 9% expansion (by growing its size or inexcessively
 1564                  * reducing its free list).  Otherwise, try to reclaim
 1565                  * space for a 10% expansion.
 1566                  */
 1567                 if (vstir && force == 0) {
 1568                         force = 1;
 1569                         vstir = 0;
 1570                 }
 1571                 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) {
 1572                         vnlruproc_sig = 0;
 1573                         wakeup(&vnlruproc_sig);
 1574                         msleep(vnlruproc, &vnode_list_mtx,
 1575                             PVFS|PDROP, "vlruwt", hz);
 1576                         continue;
 1577                 }
 1578                 rfreevnodes = vnlru_read_freevnodes();
 1579 
 1580                 onumvnodes = rnumvnodes;
 1581                 /*
 1582                  * Calculate parameters for recycling.  These are the same
 1583                  * throughout the loop to give some semblance of fairness.
 1584                  * The trigger point is to avoid recycling vnodes with lots
 1585                  * of resident pages.  We aren't trying to free memory; we
 1586                  * are trying to recycle or at least free vnodes.
 1587                  */
 1588                 if (rnumvnodes <= desiredvnodes)
 1589                         usevnodes = rnumvnodes - rfreevnodes;
 1590                 else
 1591                         usevnodes = rnumvnodes;
 1592                 if (usevnodes <= 0)
 1593                         usevnodes = 1;
 1594                 /*
 1595                  * The trigger value is chosen to give a conservatively
 1596                  * large value to ensure that it alone doesn't prevent
 1597                  * making progress.  The value can easily be so large that
 1598                  * it is effectively infinite in some congested and
 1599                  * misconfigured cases, and this is necessary.  Normally
 1600                  * it is about 8 to 100 (pages), which is quite large.
 1601                  */
 1602                 trigger = vm_cnt.v_page_count * 2 / usevnodes;
 1603                 if (force < 2)
 1604                         trigger = vsmalltrigger;
 1605                 reclaim_nc_src = force >= 3;
 1606                 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1);
 1607                 target = target / 10 + 1;
 1608                 done = vlrureclaim(reclaim_nc_src, trigger, target);
 1609                 mtx_unlock(&vnode_list_mtx);
 1610                 if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
 1611                         uma_reclaim(UMA_RECLAIM_DRAIN);
 1612                 if (done == 0) {
 1613                         if (force == 0 || force == 1) {
 1614                                 force = 2;
 1615                                 continue;
 1616                         }
 1617                         if (force == 2) {
 1618                                 force = 3;
 1619                                 continue;
 1620                         }
 1621                         want_reread = true;
 1622                         force = 0;
 1623                         vnlru_nowhere++;
 1624                         tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 1625                 } else {
 1626                         want_reread = true;
 1627                         kern_yield(PRI_USER);
 1628                 }
 1629         }
 1630 }
 1631 
 1632 static struct kproc_desc vnlru_kp = {
 1633         "vnlru",
 1634         vnlru_proc,
 1635         &vnlruproc
 1636 };
 1637 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
 1638     &vnlru_kp);
 1639 
 1640 /*
 1641  * Routines having to do with the management of the vnode table.
 1642  */
 1643 
 1644 /*
 1645  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 1646  * before we actually vgone().  This function must be called with the vnode
 1647  * held to prevent the vnode from being returned to the free list midway
 1648  * through vgone().
 1649  */
 1650 static int
 1651 vtryrecycle(struct vnode *vp)
 1652 {
 1653         struct mount *vnmp;
 1654 
 1655         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 1656         VNASSERT(vp->v_holdcnt, vp,
 1657             ("vtryrecycle: Recycling vp %p without a reference.", vp));
 1658         /*
 1659          * This vnode may found and locked via some other list, if so we
 1660          * can't recycle it yet.
 1661          */
 1662         if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 1663                 CTR2(KTR_VFS,
 1664                     "%s: impossible to recycle, vp %p lock is already held",
 1665                     __func__, vp);
 1666                 vdrop_recycle(vp);
 1667                 return (EWOULDBLOCK);
 1668         }
 1669         /*
 1670          * Don't recycle if its filesystem is being suspended.
 1671          */
 1672         if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
 1673                 VOP_UNLOCK(vp);
 1674                 CTR2(KTR_VFS,
 1675                     "%s: impossible to recycle, cannot start the write for %p",
 1676                     __func__, vp);
 1677                 vdrop_recycle(vp);
 1678                 return (EBUSY);
 1679         }
 1680         /*
 1681          * If we got this far, we need to acquire the interlock and see if
 1682          * anyone picked up this vnode from another list.  If not, we will
 1683          * mark it with DOOMED via vgonel() so that anyone who does find it
 1684          * will skip over it.
 1685          */
 1686         VI_LOCK(vp);
 1687         if (vp->v_usecount) {
 1688                 VOP_UNLOCK(vp);
 1689                 vdropl_recycle(vp);
 1690                 vn_finished_write(vnmp);
 1691                 CTR2(KTR_VFS,
 1692                     "%s: impossible to recycle, %p is already referenced",
 1693                     __func__, vp);
 1694                 return (EBUSY);
 1695         }
 1696         if (!VN_IS_DOOMED(vp)) {
 1697                 counter_u64_add(recycles_free_count, 1);
 1698                 vgonel(vp);
 1699         }
 1700         VOP_UNLOCK(vp);
 1701         vdropl_recycle(vp);
 1702         vn_finished_write(vnmp);
 1703         return (0);
 1704 }
 1705 
 1706 /*
 1707  * Allocate a new vnode.
 1708  *
 1709  * The operation never returns an error. Returning an error was disabled
 1710  * in r145385 (dated 2005) with the following comment:
 1711  *
 1712  * XXX Not all VFS_VGET/ffs_vget callers check returns.
 1713  *
 1714  * Given the age of this commit (almost 15 years at the time of writing this
 1715  * comment) restoring the ability to fail requires a significant audit of
 1716  * all codepaths.
 1717  *
 1718  * The routine can try to free a vnode or stall for up to 1 second waiting for
 1719  * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation.
 1720  */
 1721 static u_long vn_alloc_cyclecount;
 1722 
 1723 static struct vnode * __noinline
 1724 vn_alloc_hard(struct mount *mp)
 1725 {
 1726         u_long rnumvnodes, rfreevnodes;
 1727 
 1728         mtx_lock(&vnode_list_mtx);
 1729         rnumvnodes = atomic_load_long(&numvnodes);
 1730         if (rnumvnodes + 1 < desiredvnodes) {
 1731                 vn_alloc_cyclecount = 0;
 1732                 goto alloc;
 1733         }
 1734         rfreevnodes = vnlru_read_freevnodes();
 1735         if (vn_alloc_cyclecount++ >= rfreevnodes) {
 1736                 vn_alloc_cyclecount = 0;
 1737                 vstir = 1;
 1738         }
 1739         /*
 1740          * Grow the vnode cache if it will not be above its target max
 1741          * after growing.  Otherwise, if the free list is nonempty, try
 1742          * to reclaim 1 item from it before growing the cache (possibly
 1743          * above its target max if the reclamation failed or is delayed).
 1744          * Otherwise, wait for some space.  In all cases, schedule
 1745          * vnlru_proc() if we are getting short of space.  The watermarks
 1746          * should be chosen so that we never wait or even reclaim from
 1747          * the free list to below its target minimum.
 1748          */
 1749         if (vnlru_free_locked(1) > 0)
 1750                 goto alloc;
 1751         if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
 1752                 /*
 1753                  * Wait for space for a new vnode.
 1754                  */
 1755                 vnlru_kick();
 1756                 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
 1757                 if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
 1758                     vnlru_read_freevnodes() > 1)
 1759                         vnlru_free_locked(1);
 1760         }
 1761 alloc:
 1762         rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
 1763         if (vnlru_under(rnumvnodes, vlowat))
 1764                 vnlru_kick();
 1765         mtx_unlock(&vnode_list_mtx);
 1766         return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 1767 }
 1768 
 1769 static struct vnode *
 1770 vn_alloc(struct mount *mp)
 1771 {
 1772         u_long rnumvnodes;
 1773 
 1774         if (__predict_false(vn_alloc_cyclecount != 0))
 1775                 return (vn_alloc_hard(mp));
 1776         rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
 1777         if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) {
 1778                 atomic_subtract_long(&numvnodes, 1);
 1779                 return (vn_alloc_hard(mp));
 1780         }
 1781 
 1782         return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 1783 }
 1784 
 1785 static void
 1786 vn_free(struct vnode *vp)
 1787 {
 1788 
 1789         atomic_subtract_long(&numvnodes, 1);
 1790         uma_zfree_smr(vnode_zone, vp);
 1791 }
 1792 
 1793 /*
 1794  * Return the next vnode from the free list.
 1795  */
 1796 int
 1797 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
 1798     struct vnode **vpp)
 1799 {
 1800         struct vnode *vp;
 1801         struct thread *td;
 1802         struct lock_object *lo;
 1803 
 1804         CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
 1805 
 1806         KASSERT(vops->registered,
 1807             ("%s: not registered vector op %p\n", __func__, vops));
 1808 
 1809         td = curthread;
 1810         if (td->td_vp_reserved != NULL) {
 1811                 vp = td->td_vp_reserved;
 1812                 td->td_vp_reserved = NULL;
 1813         } else {
 1814                 vp = vn_alloc(mp);
 1815         }
 1816         counter_u64_add(vnodes_created, 1);
 1817         /*
 1818          * Locks are given the generic name "vnode" when created.
 1819          * Follow the historic practice of using the filesystem
 1820          * name when they allocated, e.g., "zfs", "ufs", "nfs, etc.
 1821          *
 1822          * Locks live in a witness group keyed on their name. Thus,
 1823          * when a lock is renamed, it must also move from the witness
 1824          * group of its old name to the witness group of its new name.
 1825          *
 1826          * The change only needs to be made when the vnode moves
 1827          * from one filesystem type to another. We ensure that each
 1828          * filesystem use a single static name pointer for its tag so
 1829          * that we can compare pointers rather than doing a strcmp().
 1830          */
 1831         lo = &vp->v_vnlock->lock_object;
 1832 #ifdef WITNESS
 1833         if (lo->lo_name != tag) {
 1834 #endif
 1835                 lo->lo_name = tag;
 1836 #ifdef WITNESS
 1837                 WITNESS_DESTROY(lo);
 1838                 WITNESS_INIT(lo, tag);
 1839         }
 1840 #endif
 1841         /*
 1842          * By default, don't allow shared locks unless filesystems opt-in.
 1843          */
 1844         vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
 1845         /*
 1846          * Finalize various vnode identity bits.
 1847          */
 1848         KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
 1849         KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
 1850         KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
 1851         vp->v_type = VNON;
 1852         vp->v_op = vops;
 1853         vp->v_irflag = 0;
 1854         v_init_counters(vp);
 1855         vn_seqc_init(vp);
 1856         vp->v_bufobj.bo_ops = &buf_ops_bio;
 1857 #ifdef DIAGNOSTIC
 1858         if (mp == NULL && vops != &dead_vnodeops)
 1859                 printf("NULL mp in getnewvnode(9), tag %s\n", tag);
 1860 #endif
 1861 #ifdef MAC
 1862         mac_vnode_init(vp);
 1863         if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
 1864                 mac_vnode_associate_singlelabel(mp, vp);
 1865 #endif
 1866         if (mp != NULL) {
 1867                 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
 1868                 if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 1869                         vp->v_vflag |= VV_NOKNOTE;
 1870         }
 1871 
 1872         /*
 1873          * For the filesystems which do not use vfs_hash_insert(),
 1874          * still initialize v_hash to have vfs_hash_index() useful.
 1875          * E.g., nullfs uses vfs_hash_index() on the lower vnode for
 1876          * its own hashing.
 1877          */
 1878         vp->v_hash = (uintptr_t)vp >> vnsz2log;
 1879 
 1880         *vpp = vp;
 1881         return (0);
 1882 }
 1883 
 1884 void
 1885 getnewvnode_reserve(void)
 1886 {
 1887         struct thread *td;
 1888 
 1889         td = curthread;
 1890         MPASS(td->td_vp_reserved == NULL);
 1891         td->td_vp_reserved = vn_alloc(NULL);
 1892 }
 1893 
 1894 void
 1895 getnewvnode_drop_reserve(void)
 1896 {
 1897         struct thread *td;
 1898 
 1899         td = curthread;
 1900         if (td->td_vp_reserved != NULL) {
 1901                 vn_free(td->td_vp_reserved);
 1902                 td->td_vp_reserved = NULL;
 1903         }
 1904 }
 1905 
 1906 static void __noinline
 1907 freevnode(struct vnode *vp)
 1908 {
 1909         struct bufobj *bo;
 1910 
 1911         /*
 1912          * The vnode has been marked for destruction, so free it.
 1913          *
 1914          * The vnode will be returned to the zone where it will
 1915          * normally remain until it is needed for another vnode. We
 1916          * need to cleanup (or verify that the cleanup has already
 1917          * been done) any residual data left from its current use
 1918          * so as not to contaminate the freshly allocated vnode.
 1919          */
 1920         CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
 1921         /*
 1922          * Paired with vgone.
 1923          */
 1924         vn_seqc_write_end_free(vp);
 1925 
 1926         bo = &vp->v_bufobj;
 1927         VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
 1928         VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
 1929         VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
 1930         VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
 1931         VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
 1932         VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
 1933         VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
 1934             ("clean blk trie not empty"));
 1935         VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
 1936         VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
 1937             ("dirty blk trie not empty"));
 1938         VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
 1939         VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
 1940         VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
 1941         VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
 1942             ("Dangling rangelock waiters"));
 1943         VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp,
 1944             ("Leaked inactivation"));
 1945         VI_UNLOCK(vp);
 1946 #ifdef MAC
 1947         mac_vnode_destroy(vp);
 1948 #endif
 1949         if (vp->v_pollinfo != NULL) {
 1950                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 1951                 destroy_vpollinfo(vp->v_pollinfo);
 1952                 VOP_UNLOCK(vp);
 1953                 vp->v_pollinfo = NULL;
 1954         }
 1955         vp->v_mountedhere = NULL;
 1956         vp->v_unpcb = NULL;
 1957         vp->v_rdev = NULL;
 1958         vp->v_fifoinfo = NULL;
 1959         vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
 1960         vp->v_iflag = 0;
 1961         vp->v_vflag = 0;
 1962         bo->bo_flag = 0;
 1963         vn_free(vp);
 1964 }
 1965 
 1966 /*
 1967  * Delete from old mount point vnode list, if on one.
 1968  */
 1969 static void
 1970 delmntque(struct vnode *vp)
 1971 {
 1972         struct mount *mp;
 1973 
 1974         VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
 1975 
 1976         mp = vp->v_mount;
 1977         if (mp == NULL)
 1978                 return;
 1979         MNT_ILOCK(mp);
 1980         VI_LOCK(vp);
 1981         vp->v_mount = NULL;
 1982         VI_UNLOCK(vp);
 1983         VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
 1984                 ("bad mount point vnode list size"));
 1985         TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 1986         mp->mnt_nvnodelistsize--;
 1987         MNT_REL(mp);
 1988         MNT_IUNLOCK(mp);
 1989 }
 1990 
 1991 static void
 1992 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
 1993 {
 1994 
 1995         vp->v_data = NULL;
 1996         vp->v_op = &dead_vnodeops;
 1997         vgone(vp);
 1998         vput(vp);
 1999 }
 2000 
 2001 /*
 2002  * Insert into list of vnodes for the new mount point, if available.
 2003  */
 2004 int
 2005 insmntque1(struct vnode *vp, struct mount *mp,
 2006         void (*dtr)(struct vnode *, void *), void *dtr_arg)
 2007 {
 2008 
 2009         KASSERT(vp->v_mount == NULL,
 2010                 ("insmntque: vnode already on per mount vnode list"));
 2011         VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
 2012         ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
 2013 
 2014         /*
 2015          * We acquire the vnode interlock early to ensure that the
 2016          * vnode cannot be recycled by another process releasing a
 2017          * holdcnt on it before we get it on both the vnode list
 2018          * and the active vnode list. The mount mutex protects only
 2019          * manipulation of the vnode list and the vnode freelist
 2020          * mutex protects only manipulation of the active vnode list.
 2021          * Hence the need to hold the vnode interlock throughout.
 2022          */
 2023         MNT_ILOCK(mp);
 2024         VI_LOCK(vp);
 2025         if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 &&
 2026             ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
 2027             mp->mnt_nvnodelistsize == 0)) &&
 2028             (vp->v_vflag & VV_FORCEINSMQ) == 0) {
 2029                 VI_UNLOCK(vp);
 2030                 MNT_IUNLOCK(mp);
 2031                 if (dtr != NULL)
 2032                         dtr(vp, dtr_arg);
 2033                 return (EBUSY);
 2034         }
 2035         vp->v_mount = mp;
 2036         MNT_REF(mp);
 2037         TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 2038         VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
 2039                 ("neg mount point vnode list size"));
 2040         mp->mnt_nvnodelistsize++;
 2041         VI_UNLOCK(vp);
 2042         MNT_IUNLOCK(mp);
 2043         return (0);
 2044 }
 2045 
 2046 int
 2047 insmntque(struct vnode *vp, struct mount *mp)
 2048 {
 2049 
 2050         return (insmntque1(vp, mp, insmntque_stddtr, NULL));
 2051 }
 2052 
 2053 /*
 2054  * Flush out and invalidate all buffers associated with a bufobj
 2055  * Called with the underlying object locked.
 2056  */
 2057 int
 2058 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
 2059 {
 2060         int error;
 2061 
 2062         BO_LOCK(bo);
 2063         if (flags & V_SAVE) {
 2064                 error = bufobj_wwait(bo, slpflag, slptimeo);
 2065                 if (error) {
 2066                         BO_UNLOCK(bo);
 2067                         return (error);
 2068                 }
 2069                 if (bo->bo_dirty.bv_cnt > 0) {
 2070                         BO_UNLOCK(bo);
 2071                         do {
 2072                                 error = BO_SYNC(bo, MNT_WAIT);
 2073                         } while (error == ERELOOKUP);
 2074                         if (error != 0)
 2075                                 return (error);
 2076                         BO_LOCK(bo);
 2077                         if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) {
 2078                                 BO_UNLOCK(bo);
 2079                                 return (EBUSY);
 2080                         }
 2081                 }
 2082         }
 2083         /*
 2084          * If you alter this loop please notice that interlock is dropped and
 2085          * reacquired in flushbuflist.  Special care is needed to ensure that
 2086          * no race conditions occur from this.
 2087          */
 2088         do {
 2089                 error = flushbuflist(&bo->bo_clean,
 2090                     flags, bo, slpflag, slptimeo);
 2091                 if (error == 0 && !(flags & V_CLEANONLY))
 2092                         error = flushbuflist(&bo->bo_dirty,
 2093                             flags, bo, slpflag, slptimeo);
 2094                 if (error != 0 && error != EAGAIN) {
 2095                         BO_UNLOCK(bo);
 2096                         return (error);
 2097                 }
 2098         } while (error != 0);
 2099 
 2100         /*
 2101          * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
 2102          * have write I/O in-progress but if there is a VM object then the
 2103          * VM object can also have read-I/O in-progress.
 2104          */
 2105         do {
 2106                 bufobj_wwait(bo, 0, 0);
 2107                 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) {
 2108                         BO_UNLOCK(bo);
 2109                         vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx");
 2110                         BO_LOCK(bo);
 2111                 }
 2112         } while (bo->bo_numoutput > 0);
 2113         BO_UNLOCK(bo);
 2114 
 2115         /*
 2116          * Destroy the copy in the VM cache, too.
 2117          */
 2118         if (bo->bo_object != NULL &&
 2119             (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
 2120                 VM_OBJECT_WLOCK(bo->bo_object);
 2121                 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
 2122                     OBJPR_CLEANONLY : 0);
 2123                 VM_OBJECT_WUNLOCK(bo->bo_object);
 2124         }
 2125 
 2126 #ifdef INVARIANTS
 2127         BO_LOCK(bo);
 2128         if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO |
 2129             V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 ||
 2130             bo->bo_clean.bv_cnt > 0))
 2131                 panic("vinvalbuf: flush failed");
 2132         if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
 2133             bo->bo_dirty.bv_cnt > 0)
 2134                 panic("vinvalbuf: flush dirty failed");
 2135         BO_UNLOCK(bo);
 2136 #endif
 2137         return (0);
 2138 }
 2139 
 2140 /*
 2141  * Flush out and invalidate all buffers associated with a vnode.
 2142  * Called with the underlying object locked.
 2143  */
 2144 int
 2145 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
 2146 {
 2147 
 2148         CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
 2149         ASSERT_VOP_LOCKED(vp, "vinvalbuf");
 2150         if (vp->v_object != NULL && vp->v_object->handle != vp)
 2151                 return (0);
 2152         return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
 2153 }
 2154 
 2155 /*
 2156  * Flush out buffers on the specified list.
 2157  *
 2158  */
 2159 static int
 2160 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
 2161     int slptimeo)
 2162 {
 2163         struct buf *bp, *nbp;
 2164         int retval, error;
 2165         daddr_t lblkno;
 2166         b_xflags_t xflags;
 2167 
 2168         ASSERT_BO_WLOCKED(bo);
 2169 
 2170         retval = 0;
 2171         TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
 2172                 /*
 2173                  * If we are flushing both V_NORMAL and V_ALT buffers then
 2174                  * do not skip any buffers. If we are flushing only V_NORMAL
 2175                  * buffers then skip buffers marked as BX_ALTDATA. If we are
 2176                  * flushing only V_ALT buffers then skip buffers not marked
 2177                  * as BX_ALTDATA.
 2178                  */
 2179                 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) &&
 2180                    (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) ||
 2181                     ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) {
 2182                         continue;
 2183                 }
 2184                 if (nbp != NULL) {
 2185                         lblkno = nbp->b_lblkno;
 2186                         xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
 2187                 }
 2188                 retval = EAGAIN;
 2189                 error = BUF_TIMELOCK(bp,
 2190                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
 2191                     "flushbuf", slpflag, slptimeo);
 2192                 if (error) {
 2193                         BO_LOCK(bo);
 2194                         return (error != ENOLCK ? error : EAGAIN);
 2195                 }
 2196                 KASSERT(bp->b_bufobj == bo,
 2197                     ("bp %p wrong b_bufobj %p should be %p",
 2198                     bp, bp->b_bufobj, bo));
 2199                 /*
 2200                  * XXX Since there are no node locks for NFS, I
 2201                  * believe there is a slight chance that a delayed
 2202                  * write will occur while sleeping just above, so
 2203                  * check for it.
 2204                  */
 2205                 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
 2206                     (flags & V_SAVE)) {
 2207                         bremfree(bp);
 2208                         bp->b_flags |= B_ASYNC;
 2209                         bwrite(bp);
 2210                         BO_LOCK(bo);
 2211                         return (EAGAIN);        /* XXX: why not loop ? */
 2212                 }
 2213                 bremfree(bp);
 2214                 bp->b_flags |= (B_INVAL | B_RELBUF);
 2215                 bp->b_flags &= ~B_ASYNC;
 2216                 brelse(bp);
 2217                 BO_LOCK(bo);
 2218                 if (nbp == NULL)
 2219                         break;
 2220                 nbp = gbincore(bo, lblkno);
 2221                 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 2222                     != xflags)
 2223                         break;                  /* nbp invalid */
 2224         }
 2225         return (retval);
 2226 }
 2227 
 2228 int
 2229 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
 2230 {
 2231         struct buf *bp;
 2232         int error;
 2233         daddr_t lblkno;
 2234 
 2235         ASSERT_BO_LOCKED(bo);
 2236 
 2237         for (lblkno = startn;;) {
 2238 again:
 2239                 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
 2240                 if (bp == NULL || bp->b_lblkno >= endn ||
 2241                     bp->b_lblkno < startn)
 2242                         break;
 2243                 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
 2244                     LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
 2245                 if (error != 0) {
 2246                         BO_RLOCK(bo);
 2247                         if (error == ENOLCK)
 2248                                 goto again;
 2249                         return (error);
 2250                 }
 2251                 KASSERT(bp->b_bufobj == bo,
 2252                     ("bp %p wrong b_bufobj %p should be %p",
 2253                     bp, bp->b_bufobj, bo));
 2254                 lblkno = bp->b_lblkno + 1;
 2255                 if ((bp->b_flags & B_MANAGED) == 0)
 2256                         bremfree(bp);
 2257                 bp->b_flags |= B_RELBUF;
 2258                 /*
 2259                  * In the VMIO case, use the B_NOREUSE flag to hint that the
 2260                  * pages backing each buffer in the range are unlikely to be
 2261                  * reused.  Dirty buffers will have the hint applied once
 2262                  * they've been written.
 2263                  */
 2264                 if ((bp->b_flags & B_VMIO) != 0)
 2265                         bp->b_flags |= B_NOREUSE;
 2266                 brelse(bp);
 2267                 BO_RLOCK(bo);
 2268         }
 2269         return (0);
 2270 }
 2271 
 2272 /*
 2273  * Truncate a file's buffer and pages to a specified length.  This
 2274  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 2275  * sync activity.
 2276  */
 2277 int
 2278 vtruncbuf(struct vnode *vp, off_t length, int blksize)
 2279 {
 2280         struct buf *bp, *nbp;
 2281         struct bufobj *bo;
 2282         daddr_t startlbn;
 2283 
 2284         CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__,
 2285             vp, blksize, (uintmax_t)length);
 2286 
 2287         /*
 2288          * Round up to the *next* lbn.
 2289          */
 2290         startlbn = howmany(length, blksize);
 2291 
 2292         ASSERT_VOP_LOCKED(vp, "vtruncbuf");
 2293 
 2294         bo = &vp->v_bufobj;
 2295 restart_unlocked:
 2296         BO_LOCK(bo);
 2297 
 2298         while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN)
 2299                 ;
 2300 
 2301         if (length > 0) {
 2302 restartsync:
 2303                 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 2304                         if (bp->b_lblkno > 0)
 2305                                 continue;
 2306                         /*
 2307                          * Since we hold the vnode lock this should only
 2308                          * fail if we're racing with the buf daemon.
 2309                          */
 2310                         if (BUF_LOCK(bp,
 2311                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 2312                             BO_LOCKPTR(bo)) == ENOLCK)
 2313                                 goto restart_unlocked;
 2314 
 2315                         VNASSERT((bp->b_flags & B_DELWRI), vp,
 2316                             ("buf(%p) on dirty queue without DELWRI", bp));
 2317 
 2318                         bremfree(bp);
 2319                         bawrite(bp);
 2320                         BO_LOCK(bo);
 2321                         goto restartsync;
 2322                 }
 2323         }
 2324 
 2325         bufobj_wwait(bo, 0, 0);
 2326         BO_UNLOCK(bo);
 2327         vnode_pager_setsize(vp, length);
 2328 
 2329         return (0);
 2330 }
 2331 
 2332 /*
 2333  * Invalidate the cached pages of a file's buffer within the range of block
 2334  * numbers [startlbn, endlbn).
 2335  */
 2336 void
 2337 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
 2338     int blksize)
 2339 {
 2340         struct bufobj *bo;
 2341         off_t start, end;
 2342 
 2343         ASSERT_VOP_LOCKED(vp, "v_inval_buf_range");
 2344 
 2345         start = blksize * startlbn;
 2346         end = blksize * endlbn;
 2347 
 2348         bo = &vp->v_bufobj;
 2349         BO_LOCK(bo);
 2350         MPASS(blksize == bo->bo_bsize);
 2351 
 2352         while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN)
 2353                 ;
 2354 
 2355         BO_UNLOCK(bo);
 2356         vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1));
 2357 }
 2358 
 2359 static int
 2360 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
 2361     daddr_t startlbn, daddr_t endlbn)
 2362 {
 2363         struct buf *bp, *nbp;
 2364         bool anyfreed;
 2365 
 2366         ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked");
 2367         ASSERT_BO_LOCKED(bo);
 2368 
 2369         do {
 2370                 anyfreed = false;
 2371                 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
 2372                         if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
 2373                                 continue;
 2374                         if (BUF_LOCK(bp,
 2375                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 2376                             BO_LOCKPTR(bo)) == ENOLCK) {
 2377                                 BO_LOCK(bo);
 2378                                 return (EAGAIN);
 2379                         }
 2380 
 2381                         bremfree(bp);
 2382                         bp->b_flags |= B_INVAL | B_RELBUF;
 2383                         bp->b_flags &= ~B_ASYNC;
 2384                         brelse(bp);
 2385                         anyfreed = true;
 2386 
 2387                         BO_LOCK(bo);
 2388                         if (nbp != NULL &&
 2389                             (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
 2390                             nbp->b_vp != vp ||
 2391                             (nbp->b_flags & B_DELWRI) != 0))
 2392                                 return (EAGAIN);
 2393                 }
 2394 
 2395                 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 2396                         if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
 2397                                 continue;
 2398                         if (BUF_LOCK(bp,
 2399                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 2400                             BO_LOCKPTR(bo)) == ENOLCK) {
 2401                                 BO_LOCK(bo);
 2402                                 return (EAGAIN);
 2403                         }
 2404                         bremfree(bp);
 2405                         bp->b_flags |= B_INVAL | B_RELBUF;
 2406                         bp->b_flags &= ~B_ASYNC;
 2407                         brelse(bp);
 2408                         anyfreed = true;
 2409 
 2410                         BO_LOCK(bo);
 2411                         if (nbp != NULL &&
 2412                             (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
 2413                             (nbp->b_vp != vp) ||
 2414                             (nbp->b_flags & B_DELWRI) == 0))
 2415                                 return (EAGAIN);
 2416                 }
 2417         } while (anyfreed);
 2418         return (0);
 2419 }
 2420 
 2421 static void
 2422 buf_vlist_remove(struct buf *bp)
 2423 {
 2424         struct bufv *bv;
 2425         b_xflags_t flags;
 2426 
 2427         flags = bp->b_xflags;
 2428 
 2429         KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 2430         ASSERT_BO_WLOCKED(bp->b_bufobj);
 2431         KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 &&
 2432             (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN),
 2433             ("%s: buffer %p has invalid queue state", __func__, bp));
 2434 
 2435         if ((flags & BX_VNDIRTY) != 0)
 2436                 bv = &bp->b_bufobj->bo_dirty;
 2437         else
 2438                 bv = &bp->b_bufobj->bo_clean;
 2439         BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
 2440         TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
 2441         bv->bv_cnt--;
 2442         bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 2443 }
 2444 
 2445 /*
 2446  * Add the buffer to the sorted clean or dirty block list.
 2447  *
 2448  * NOTE: xflags is passed as a constant, optimizing this inline function!
 2449  */
 2450 static void
 2451 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
 2452 {
 2453         struct bufv *bv;
 2454         struct buf *n;
 2455         int error;
 2456 
 2457         ASSERT_BO_WLOCKED(bo);
 2458         KASSERT((bo->bo_flag & BO_NOBUFS) == 0,
 2459             ("buf_vlist_add: bo %p does not allow bufs", bo));
 2460         KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
 2461             ("dead bo %p", bo));
 2462         KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
 2463             ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
 2464         bp->b_xflags |= xflags;
 2465         if (xflags & BX_VNDIRTY)
 2466                 bv = &bo->bo_dirty;
 2467         else
 2468                 bv = &bo->bo_clean;
 2469 
 2470         /*
 2471          * Keep the list ordered.  Optimize empty list insertion.  Assume
 2472          * we tend to grow at the tail so lookup_le should usually be cheaper
 2473          * than _ge. 
 2474          */
 2475         if (bv->bv_cnt == 0 ||
 2476             bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
 2477                 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
 2478         else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
 2479                 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
 2480         else
 2481                 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
 2482         error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
 2483         if (error)
 2484                 panic("buf_vlist_add:  Preallocated nodes insufficient.");
 2485         bv->bv_cnt++;
 2486 }
 2487 
 2488 /*
 2489  * Look up a buffer using the buffer tries.
 2490  */
 2491 struct buf *
 2492 gbincore(struct bufobj *bo, daddr_t lblkno)
 2493 {
 2494         struct buf *bp;
 2495 
 2496         ASSERT_BO_LOCKED(bo);
 2497         bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
 2498         if (bp != NULL)
 2499                 return (bp);
 2500         return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno));
 2501 }
 2502 
 2503 /*
 2504  * Look up a buf using the buffer tries, without the bufobj lock.  This relies
 2505  * on SMR for safe lookup, and bufs being in a no-free zone to provide type
 2506  * stability of the result.  Like other lockless lookups, the found buf may
 2507  * already be invalid by the time this function returns.
 2508  */
 2509 struct buf *
 2510 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno)
 2511 {
 2512         struct buf *bp;
 2513 
 2514         ASSERT_BO_UNLOCKED(bo);
 2515         bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno);
 2516         if (bp != NULL)
 2517                 return (bp);
 2518         return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno));
 2519 }
 2520 
 2521 /*
 2522  * Associate a buffer with a vnode.
 2523  */
 2524 void
 2525 bgetvp(struct vnode *vp, struct buf *bp)
 2526 {
 2527         struct bufobj *bo;
 2528 
 2529         bo = &vp->v_bufobj;
 2530         ASSERT_BO_WLOCKED(bo);
 2531         VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
 2532 
 2533         CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
 2534         VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
 2535             ("bgetvp: bp already attached! %p", bp));
 2536 
 2537         vhold(vp);
 2538         bp->b_vp = vp;
 2539         bp->b_bufobj = bo;
 2540         /*
 2541          * Insert onto list for new vnode.
 2542          */
 2543         buf_vlist_add(bp, bo, BX_VNCLEAN);
 2544 }
 2545 
 2546 /*
 2547  * Disassociate a buffer from a vnode.
 2548  */
 2549 void
 2550 brelvp(struct buf *bp)
 2551 {
 2552         struct bufobj *bo;
 2553         struct vnode *vp;
 2554 
 2555         CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 2556         KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 2557 
 2558         /*
 2559          * Delete from old vnode list, if on one.
 2560          */
 2561         vp = bp->b_vp;          /* XXX */
 2562         bo = bp->b_bufobj;
 2563         BO_LOCK(bo);
 2564         buf_vlist_remove(bp);
 2565         if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 2566                 bo->bo_flag &= ~BO_ONWORKLST;
 2567                 mtx_lock(&sync_mtx);
 2568                 LIST_REMOVE(bo, bo_synclist);
 2569                 syncer_worklist_len--;
 2570                 mtx_unlock(&sync_mtx);
 2571         }
 2572         bp->b_vp = NULL;
 2573         bp->b_bufobj = NULL;
 2574         BO_UNLOCK(bo);
 2575         vdrop(vp);
 2576 }
 2577 
 2578 /*
 2579  * Add an item to the syncer work queue.
 2580  */
 2581 static void
 2582 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
 2583 {
 2584         int slot;
 2585 
 2586         ASSERT_BO_WLOCKED(bo);
 2587 
 2588         mtx_lock(&sync_mtx);
 2589         if (bo->bo_flag & BO_ONWORKLST)
 2590                 LIST_REMOVE(bo, bo_synclist);
 2591         else {
 2592                 bo->bo_flag |= BO_ONWORKLST;
 2593                 syncer_worklist_len++;
 2594         }
 2595 
 2596         if (delay > syncer_maxdelay - 2)
 2597                 delay = syncer_maxdelay - 2;
 2598         slot = (syncer_delayno + delay) & syncer_mask;
 2599 
 2600         LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
 2601         mtx_unlock(&sync_mtx);
 2602 }
 2603 
 2604 static int
 2605 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
 2606 {
 2607         int error, len;
 2608 
 2609         mtx_lock(&sync_mtx);
 2610         len = syncer_worklist_len - sync_vnode_count;
 2611         mtx_unlock(&sync_mtx);
 2612         error = SYSCTL_OUT(req, &len, sizeof(len));
 2613         return (error);
 2614 }
 2615 
 2616 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len,
 2617     CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0,
 2618     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
 2619 
 2620 static struct proc *updateproc;
 2621 static void sched_sync(void);
 2622 static struct kproc_desc up_kp = {
 2623         "syncer",
 2624         sched_sync,
 2625         &updateproc
 2626 };
 2627 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
 2628 
 2629 static int
 2630 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
 2631 {
 2632         struct vnode *vp;
 2633         struct mount *mp;
 2634 
 2635         *bo = LIST_FIRST(slp);
 2636         if (*bo == NULL)
 2637                 return (0);
 2638         vp = bo2vnode(*bo);
 2639         if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
 2640                 return (1);
 2641         /*
 2642          * We use vhold in case the vnode does not
 2643          * successfully sync.  vhold prevents the vnode from
 2644          * going away when we unlock the sync_mtx so that
 2645          * we can acquire the vnode interlock.
 2646          */
 2647         vholdl(vp);
 2648         mtx_unlock(&sync_mtx);
 2649         VI_UNLOCK(vp);
 2650         if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 2651                 vdrop(vp);
 2652                 mtx_lock(&sync_mtx);
 2653                 return (*bo == LIST_FIRST(slp));
 2654         }
 2655         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 2656         (void) VOP_FSYNC(vp, MNT_LAZY, td);
 2657         VOP_UNLOCK(vp);
 2658         vn_finished_write(mp);
 2659         BO_LOCK(*bo);
 2660         if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
 2661                 /*
 2662                  * Put us back on the worklist.  The worklist
 2663                  * routine will remove us from our current
 2664                  * position and then add us back in at a later
 2665                  * position.
 2666                  */
 2667                 vn_syncer_add_to_worklist(*bo, syncdelay);
 2668         }
 2669         BO_UNLOCK(*bo);
 2670         vdrop(vp);
 2671         mtx_lock(&sync_mtx);
 2672         return (0);
 2673 }
 2674 
 2675 static int first_printf = 1;
 2676 
 2677 /*
 2678  * System filesystem synchronizer daemon.
 2679  */
 2680 static void
 2681 sched_sync(void)
 2682 {
 2683         struct synclist *next, *slp;
 2684         struct bufobj *bo;
 2685         long starttime;
 2686         struct thread *td = curthread;
 2687         int last_work_seen;
 2688         int net_worklist_len;
 2689         int syncer_final_iter;
 2690         int error;
 2691 
 2692         last_work_seen = 0;
 2693         syncer_final_iter = 0;
 2694         syncer_state = SYNCER_RUNNING;
 2695         starttime = time_uptime;
 2696         td->td_pflags |= TDP_NORUNNINGBUF;
 2697 
 2698         EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
 2699             SHUTDOWN_PRI_LAST);
 2700 
 2701         mtx_lock(&sync_mtx);
 2702         for (;;) {
 2703                 if (syncer_state == SYNCER_FINAL_DELAY &&
 2704                     syncer_final_iter == 0) {
 2705                         mtx_unlock(&sync_mtx);
 2706                         kproc_suspend_check(td->td_proc);
 2707                         mtx_lock(&sync_mtx);
 2708                 }
 2709                 net_worklist_len = syncer_worklist_len - sync_vnode_count;
 2710                 if (syncer_state != SYNCER_RUNNING &&
 2711                     starttime != time_uptime) {
 2712                         if (first_printf) {
 2713                                 printf("\nSyncing disks, vnodes remaining... ");
 2714                                 first_printf = 0;
 2715                         }
 2716                         printf("%d ", net_worklist_len);
 2717                 }
 2718                 starttime = time_uptime;
 2719 
 2720                 /*
 2721                  * Push files whose dirty time has expired.  Be careful
 2722                  * of interrupt race on slp queue.
 2723                  *
 2724                  * Skip over empty worklist slots when shutting down.
 2725                  */
 2726                 do {
 2727                         slp = &syncer_workitem_pending[syncer_delayno];
 2728                         syncer_delayno += 1;
 2729                         if (syncer_delayno == syncer_maxdelay)
 2730                                 syncer_delayno = 0;
 2731                         next = &syncer_workitem_pending[syncer_delayno];
 2732                         /*
 2733                          * If the worklist has wrapped since the
 2734                          * it was emptied of all but syncer vnodes,
 2735                          * switch to the FINAL_DELAY state and run
 2736                          * for one more second.
 2737                          */
 2738                         if (syncer_state == SYNCER_SHUTTING_DOWN &&
 2739                             net_worklist_len == 0 &&
 2740                             last_work_seen == syncer_delayno) {
 2741                                 syncer_state = SYNCER_FINAL_DELAY;
 2742                                 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
 2743                         }
 2744                 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
 2745                     syncer_worklist_len > 0);
 2746 
 2747                 /*
 2748                  * Keep track of the last time there was anything
 2749                  * on the worklist other than syncer vnodes.
 2750                  * Return to the SHUTTING_DOWN state if any
 2751                  * new work appears.
 2752                  */
 2753                 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
 2754                         last_work_seen = syncer_delayno;
 2755                 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
 2756                         syncer_state = SYNCER_SHUTTING_DOWN;
 2757                 while (!LIST_EMPTY(slp)) {
 2758                         error = sync_vnode(slp, &bo, td);
 2759                         if (error == 1) {
 2760                                 LIST_REMOVE(bo, bo_synclist);
 2761                                 LIST_INSERT_HEAD(next, bo, bo_synclist);
 2762                                 continue;
 2763                         }
 2764 
 2765                         if (first_printf == 0) {
 2766                                 /*
 2767                                  * Drop the sync mutex, because some watchdog
 2768                                  * drivers need to sleep while patting
 2769                                  */
 2770                                 mtx_unlock(&sync_mtx);
 2771                                 wdog_kern_pat(WD_LASTVAL);
 2772                                 mtx_lock(&sync_mtx);
 2773                         }
 2774                 }
 2775                 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
 2776                         syncer_final_iter--;
 2777                 /*
 2778                  * The variable rushjob allows the kernel to speed up the
 2779                  * processing of the filesystem syncer process. A rushjob
 2780                  * value of N tells the filesystem syncer to process the next
 2781                  * N seconds worth of work on its queue ASAP. Currently rushjob
 2782                  * is used by the soft update code to speed up the filesystem
 2783                  * syncer process when the incore state is getting so far
 2784                  * ahead of the disk that the kernel memory pool is being
 2785                  * threatened with exhaustion.
 2786                  */
 2787                 if (rushjob > 0) {
 2788                         rushjob -= 1;
 2789                         continue;
 2790                 }
 2791                 /*
 2792                  * Just sleep for a short period of time between
 2793                  * iterations when shutting down to allow some I/O
 2794                  * to happen.
 2795                  *
 2796                  * If it has taken us less than a second to process the
 2797                  * current work, then wait. Otherwise start right over
 2798                  * again. We can still lose time if any single round
 2799                  * takes more than two seconds, but it does not really
 2800                  * matter as we are just trying to generally pace the
 2801                  * filesystem activity.
 2802                  */
 2803                 if (syncer_state != SYNCER_RUNNING ||
 2804                     time_uptime == starttime) {
 2805                         thread_lock(td);
 2806                         sched_prio(td, PPAUSE);
 2807                         thread_unlock(td);
 2808                 }
 2809                 if (syncer_state != SYNCER_RUNNING)
 2810                         cv_timedwait(&sync_wakeup, &sync_mtx,
 2811                             hz / SYNCER_SHUTDOWN_SPEEDUP);
 2812                 else if (time_uptime == starttime)
 2813                         cv_timedwait(&sync_wakeup, &sync_mtx, hz);
 2814         }
 2815 }
 2816 
 2817 /*
 2818  * Request the syncer daemon to speed up its work.
 2819  * We never push it to speed up more than half of its
 2820  * normal turn time, otherwise it could take over the cpu.
 2821  */
 2822 int
 2823 speedup_syncer(void)
 2824 {
 2825         int ret = 0;
 2826 
 2827         mtx_lock(&sync_mtx);
 2828         if (rushjob < syncdelay / 2) {
 2829                 rushjob += 1;
 2830                 stat_rush_requests += 1;
 2831                 ret = 1;
 2832         }
 2833         mtx_unlock(&sync_mtx);
 2834         cv_broadcast(&sync_wakeup);
 2835         return (ret);
 2836 }
 2837 
 2838 /*
 2839  * Tell the syncer to speed up its work and run though its work
 2840  * list several times, then tell it to shut down.
 2841  */
 2842 static void
 2843 syncer_shutdown(void *arg, int howto)
 2844 {
 2845 
 2846         if (howto & RB_NOSYNC)
 2847                 return;
 2848         mtx_lock(&sync_mtx);
 2849         syncer_state = SYNCER_SHUTTING_DOWN;
 2850         rushjob = 0;
 2851         mtx_unlock(&sync_mtx);
 2852         cv_broadcast(&sync_wakeup);
 2853         kproc_shutdown(arg, howto);
 2854 }
 2855 
 2856 void
 2857 syncer_suspend(void)
 2858 {
 2859 
 2860         syncer_shutdown(updateproc, 0);
 2861 }
 2862 
 2863 void
 2864 syncer_resume(void)
 2865 {
 2866 
 2867         mtx_lock(&sync_mtx);
 2868         first_printf = 1;
 2869         syncer_state = SYNCER_RUNNING;
 2870         mtx_unlock(&sync_mtx);
 2871         cv_broadcast(&sync_wakeup);
 2872         kproc_resume(updateproc);
 2873 }
 2874 
 2875 /*
 2876  * Move the buffer between the clean and dirty lists of its vnode.
 2877  */
 2878 void
 2879 reassignbuf(struct buf *bp)
 2880 {
 2881         struct vnode *vp;
 2882         struct bufobj *bo;
 2883         int delay;
 2884 #ifdef INVARIANTS
 2885         struct bufv *bv;
 2886 #endif
 2887 
 2888         vp = bp->b_vp;
 2889         bo = bp->b_bufobj;
 2890 
 2891         KASSERT((bp->b_flags & B_PAGING) == 0,
 2892             ("%s: cannot reassign paging buffer %p", __func__, bp));
 2893 
 2894         CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
 2895             bp, bp->b_vp, bp->b_flags);
 2896 
 2897         BO_LOCK(bo);
 2898         buf_vlist_remove(bp);
 2899 
 2900         /*
 2901          * If dirty, put on list of dirty buffers; otherwise insert onto list
 2902          * of clean buffers.
 2903          */
 2904         if (bp->b_flags & B_DELWRI) {
 2905                 if ((bo->bo_flag & BO_ONWORKLST) == 0) {
 2906                         switch (vp->v_type) {
 2907                         case VDIR:
 2908                                 delay = dirdelay;
 2909                                 break;
 2910                         case VCHR:
 2911                                 delay = metadelay;
 2912                                 break;
 2913                         default:
 2914                                 delay = filedelay;
 2915                         }
 2916                         vn_syncer_add_to_worklist(bo, delay);
 2917                 }
 2918                 buf_vlist_add(bp, bo, BX_VNDIRTY);
 2919         } else {
 2920                 buf_vlist_add(bp, bo, BX_VNCLEAN);
 2921 
 2922                 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 2923                         mtx_lock(&sync_mtx);
 2924                         LIST_REMOVE(bo, bo_synclist);
 2925                         syncer_worklist_len--;
 2926                         mtx_unlock(&sync_mtx);
 2927                         bo->bo_flag &= ~BO_ONWORKLST;
 2928                 }
 2929         }
 2930 #ifdef INVARIANTS
 2931         bv = &bo->bo_clean;
 2932         bp = TAILQ_FIRST(&bv->bv_hd);
 2933         KASSERT(bp == NULL || bp->b_bufobj == bo,
 2934             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 2935         bp = TAILQ_LAST(&bv->bv_hd, buflists);
 2936         KASSERT(bp == NULL || bp->b_bufobj == bo,
 2937             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 2938         bv = &bo->bo_dirty;
 2939         bp = TAILQ_FIRST(&bv->bv_hd);
 2940         KASSERT(bp == NULL || bp->b_bufobj == bo,
 2941             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 2942         bp = TAILQ_LAST(&bv->bv_hd, buflists);
 2943         KASSERT(bp == NULL || bp->b_bufobj == bo,
 2944             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 2945 #endif
 2946         BO_UNLOCK(bo);
 2947 }
 2948 
 2949 static void
 2950 v_init_counters(struct vnode *vp)
 2951 {
 2952 
 2953         VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
 2954             vp, ("%s called for an initialized vnode", __FUNCTION__));
 2955         ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
 2956 
 2957         refcount_init(&vp->v_holdcnt, 1);
 2958         refcount_init(&vp->v_usecount, 1);
 2959 }
 2960 
 2961 /*
 2962  * Grab a particular vnode from the free list, increment its
 2963  * reference count and lock it.  VIRF_DOOMED is set if the vnode
 2964  * is being destroyed.  Only callers who specify LK_RETRY will
 2965  * see doomed vnodes.  If inactive processing was delayed in
 2966  * vput try to do it here.
 2967  *
 2968  * usecount is manipulated using atomics without holding any locks.
 2969  *
 2970  * holdcnt can be manipulated using atomics without holding any locks,
 2971  * except when transitioning 1<->0, in which case the interlock is held.
 2972  *
 2973  * Consumers which don't guarantee liveness of the vnode can use SMR to
 2974  * try to get a reference. Note this operation can fail since the vnode
 2975  * may be awaiting getting freed by the time they get to it.
 2976  */
 2977 enum vgetstate
 2978 vget_prep_smr(struct vnode *vp)
 2979 {
 2980         enum vgetstate vs;
 2981 
 2982         VFS_SMR_ASSERT_ENTERED();
 2983 
 2984         if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
 2985                 vs = VGET_USECOUNT;
 2986         } else {
 2987                 if (vhold_smr(vp))
 2988                         vs = VGET_HOLDCNT;
 2989                 else
 2990                         vs = VGET_NONE;
 2991         }
 2992         return (vs);
 2993 }
 2994 
 2995 enum vgetstate
 2996 vget_prep(struct vnode *vp)
 2997 {
 2998         enum vgetstate vs;
 2999 
 3000         if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
 3001                 vs = VGET_USECOUNT;
 3002         } else {
 3003                 vhold(vp);
 3004                 vs = VGET_HOLDCNT;
 3005         }
 3006         return (vs);
 3007 }
 3008 
 3009 void
 3010 vget_abort(struct vnode *vp, enum vgetstate vs)
 3011 {
 3012 
 3013         switch (vs) {
 3014         case VGET_USECOUNT:
 3015                 vrele(vp);
 3016                 break;
 3017         case VGET_HOLDCNT:
 3018                 vdrop(vp);
 3019                 break;
 3020         default:
 3021                 __assert_unreachable();
 3022         }
 3023 }
 3024 
 3025 int
 3026 vget(struct vnode *vp, int flags)
 3027 {
 3028         enum vgetstate vs;
 3029 
 3030         vs = vget_prep(vp);
 3031         return (vget_finish(vp, flags, vs));
 3032 }
 3033 
 3034 int
 3035 vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
 3036 {
 3037         int error;
 3038 
 3039         if ((flags & LK_INTERLOCK) != 0)
 3040                 ASSERT_VI_LOCKED(vp, __func__);
 3041         else
 3042                 ASSERT_VI_UNLOCKED(vp, __func__);
 3043         VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
 3044         VNPASS(vp->v_holdcnt > 0, vp);
 3045         VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
 3046 
 3047         error = vn_lock(vp, flags);
 3048         if (__predict_false(error != 0)) {
 3049                 vget_abort(vp, vs);
 3050                 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
 3051                     vp);
 3052                 return (error);
 3053         }
 3054 
 3055         vget_finish_ref(vp, vs);
 3056         return (0);
 3057 }
 3058 
 3059 void
 3060 vget_finish_ref(struct vnode *vp, enum vgetstate vs)
 3061 {
 3062         int old;
 3063 
 3064         VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
 3065         VNPASS(vp->v_holdcnt > 0, vp);
 3066         VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
 3067 
 3068         if (vs == VGET_USECOUNT)
 3069                 return;
 3070 
 3071         /*
 3072          * We hold the vnode. If the usecount is 0 it will be utilized to keep
 3073          * the vnode around. Otherwise someone else lended their hold count and
 3074          * we have to drop ours.
 3075          */
 3076         old = atomic_fetchadd_int(&vp->v_usecount, 1);
 3077         VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old));
 3078         if (old != 0) {
 3079 #ifdef INVARIANTS
 3080                 old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
 3081                 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
 3082 #else
 3083                 refcount_release(&vp->v_holdcnt);
 3084 #endif
 3085         }
 3086 }
 3087 
 3088 void
 3089 vref(struct vnode *vp)
 3090 {
 3091         enum vgetstate vs;
 3092 
 3093         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3094         vs = vget_prep(vp);
 3095         vget_finish_ref(vp, vs);
 3096 }
 3097 
 3098 void
 3099 vrefact(struct vnode *vp)
 3100 {
 3101 
 3102         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3103 #ifdef INVARIANTS
 3104         int old = atomic_fetchadd_int(&vp->v_usecount, 1);
 3105         VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old));
 3106 #else
 3107         refcount_acquire(&vp->v_usecount);
 3108 #endif
 3109 }
 3110 
 3111 void
 3112 vlazy(struct vnode *vp)
 3113 {
 3114         struct mount *mp;
 3115 
 3116         VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
 3117 
 3118         if ((vp->v_mflag & VMP_LAZYLIST) != 0)
 3119                 return;
 3120         /*
 3121          * We may get here for inactive routines after the vnode got doomed.
 3122          */
 3123         if (VN_IS_DOOMED(vp))
 3124                 return;
 3125         mp = vp->v_mount;
 3126         mtx_lock(&mp->mnt_listmtx);
 3127         if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
 3128                 vp->v_mflag |= VMP_LAZYLIST;
 3129                 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist);
 3130                 mp->mnt_lazyvnodelistsize++;
 3131         }
 3132         mtx_unlock(&mp->mnt_listmtx);
 3133 }
 3134 
 3135 static void
 3136 vunlazy(struct vnode *vp)
 3137 {
 3138         struct mount *mp;
 3139 
 3140         ASSERT_VI_LOCKED(vp, __func__);
 3141         VNPASS(!VN_IS_DOOMED(vp), vp);
 3142 
 3143         mp = vp->v_mount;
 3144         mtx_lock(&mp->mnt_listmtx);
 3145         VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
 3146         /*
 3147          * Don't remove the vnode from the lazy list if another thread
 3148          * has increased the hold count. It may have re-enqueued the
 3149          * vnode to the lazy list and is now responsible for its
 3150          * removal.
 3151          */
 3152         if (vp->v_holdcnt == 0) {
 3153                 vp->v_mflag &= ~VMP_LAZYLIST;
 3154                 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
 3155                 mp->mnt_lazyvnodelistsize--;
 3156         }
 3157         mtx_unlock(&mp->mnt_listmtx);
 3158 }
 3159 
 3160 /*
 3161  * This routine is only meant to be called from vgonel prior to dooming
 3162  * the vnode.
 3163  */
 3164 static void
 3165 vunlazy_gone(struct vnode *vp)
 3166 {
 3167         struct mount *mp;
 3168 
 3169         ASSERT_VOP_ELOCKED(vp, __func__);
 3170         ASSERT_VI_LOCKED(vp, __func__);
 3171         VNPASS(!VN_IS_DOOMED(vp), vp);
 3172 
 3173         if (vp->v_mflag & VMP_LAZYLIST) {
 3174                 mp = vp->v_mount;
 3175                 mtx_lock(&mp->mnt_listmtx);
 3176                 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
 3177                 vp->v_mflag &= ~VMP_LAZYLIST;
 3178                 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
 3179                 mp->mnt_lazyvnodelistsize--;
 3180                 mtx_unlock(&mp->mnt_listmtx);
 3181         }
 3182 }
 3183 
 3184 static void
 3185 vdefer_inactive(struct vnode *vp)
 3186 {
 3187 
 3188         ASSERT_VI_LOCKED(vp, __func__);
 3189         VNASSERT(vp->v_holdcnt > 0, vp,
 3190             ("%s: vnode without hold count", __func__));
 3191         if (VN_IS_DOOMED(vp)) {
 3192                 vdropl(vp);
 3193                 return;
 3194         }
 3195         if (vp->v_iflag & VI_DEFINACT) {
 3196                 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
 3197                 vdropl(vp);
 3198                 return;
 3199         }
 3200         if (vp->v_usecount > 0) {
 3201                 vp->v_iflag &= ~VI_OWEINACT;
 3202                 vdropl(vp);
 3203                 return;
 3204         }
 3205         vlazy(vp);
 3206         vp->v_iflag |= VI_DEFINACT;
 3207         VI_UNLOCK(vp);
 3208         counter_u64_add(deferred_inact, 1);
 3209 }
 3210 
 3211 static void
 3212 vdefer_inactive_unlocked(struct vnode *vp)
 3213 {
 3214 
 3215         VI_LOCK(vp);
 3216         if ((vp->v_iflag & VI_OWEINACT) == 0) {
 3217                 vdropl(vp);
 3218                 return;
 3219         }
 3220         vdefer_inactive(vp);
 3221 }
 3222 
 3223 enum vput_op { VRELE, VPUT, VUNREF };
 3224 
 3225 /*
 3226  * Handle ->v_usecount transitioning to 0.
 3227  *
 3228  * By releasing the last usecount we take ownership of the hold count which
 3229  * provides liveness of the vnode, meaning we have to vdrop.
 3230  *
 3231  * For all vnodes we may need to perform inactive processing. It requires an
 3232  * exclusive lock on the vnode, while it is legal to call here with only a
 3233  * shared lock (or no locks). If locking the vnode in an expected manner fails,
 3234  * inactive processing gets deferred to the syncer.
 3235  *
 3236  * XXX Some filesystems pass in an exclusively locked vnode and strongly depend
 3237  * on the lock being held all the way until VOP_INACTIVE. This in particular
 3238  * happens with UFS which adds half-constructed vnodes to the hash, where they
 3239  * can be found by other code.
 3240  */
 3241 static void
 3242 vput_final(struct vnode *vp, enum vput_op func)
 3243 {
 3244         int error;
 3245         bool want_unlock;
 3246 
 3247         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3248         VNPASS(vp->v_holdcnt > 0, vp);
 3249 
 3250         VI_LOCK(vp);
 3251 
 3252         /*
 3253          * By the time we got here someone else might have transitioned
 3254          * the count back to > 0.
 3255          */
 3256         if (vp->v_usecount > 0)
 3257                 goto out;
 3258 
 3259         /*
 3260          * If the vnode is doomed vgone already performed inactive processing
 3261          * (if needed).
 3262          */
 3263         if (VN_IS_DOOMED(vp))
 3264                 goto out;
 3265 
 3266         if (__predict_true(VOP_NEED_INACTIVE(vp) == 0))
 3267                 goto out;
 3268 
 3269         if (vp->v_iflag & VI_DOINGINACT)
 3270                 goto out;
 3271 
 3272         /*
 3273          * Locking operations here will drop the interlock and possibly the
 3274          * vnode lock, opening a window where the vnode can get doomed all the
 3275          * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to
 3276          * perform inactive.
 3277          */
 3278         vp->v_iflag |= VI_OWEINACT;
 3279         want_unlock = false;
 3280         error = 0;
 3281         switch (func) {
 3282         case VRELE:
 3283                 switch (VOP_ISLOCKED(vp)) {
 3284                 case LK_EXCLUSIVE:
 3285                         break;
 3286                 case LK_EXCLOTHER:
 3287                 case 0:
 3288                         want_unlock = true;
 3289                         error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
 3290                         VI_LOCK(vp);
 3291                         break;
 3292                 default:
 3293                         /*
 3294                          * The lock has at least one sharer, but we have no way
 3295                          * to conclude whether this is us. Play it safe and
 3296                          * defer processing.
 3297                          */
 3298                         error = EAGAIN;
 3299                         break;
 3300                 }
 3301                 break;
 3302         case VPUT:
 3303                 want_unlock = true;
 3304                 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
 3305                         error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
 3306                             LK_NOWAIT);
 3307                         VI_LOCK(vp);
 3308                 }
 3309                 break;
 3310         case VUNREF:
 3311                 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
 3312                         error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
 3313                         VI_LOCK(vp);
 3314                 }
 3315                 break;
 3316         }
 3317         if (error == 0) {
 3318                 if (func == VUNREF) {
 3319                         VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp,
 3320                             ("recursive vunref"));
 3321                         vp->v_vflag |= VV_UNREF;
 3322                 }
 3323                 for (;;) {
 3324                         error = vinactive(vp);
 3325                         if (want_unlock)
 3326                                 VOP_UNLOCK(vp);
 3327                         if (error != ERELOOKUP || !want_unlock)
 3328                                 break;
 3329                         VOP_LOCK(vp, LK_EXCLUSIVE);
 3330                 }
 3331                 if (func == VUNREF)
 3332                         vp->v_vflag &= ~VV_UNREF;
 3333                 vdropl(vp);
 3334         } else {
 3335                 vdefer_inactive(vp);
 3336         }
 3337         return;
 3338 out:
 3339         if (func == VPUT)
 3340                 VOP_UNLOCK(vp);
 3341         vdropl(vp);
 3342 }
 3343 
 3344 /*
 3345  * Decrement ->v_usecount for a vnode.
 3346  *
 3347  * Releasing the last use count requires additional processing, see vput_final
 3348  * above for details.
 3349  *
 3350  * Comment above each variant denotes lock state on entry and exit.
 3351  */
 3352 
 3353 /*
 3354  * in: any
 3355  * out: same as passed in
 3356  */
 3357 void
 3358 vrele(struct vnode *vp)
 3359 {
 3360 
 3361         ASSERT_VI_UNLOCKED(vp, __func__);
 3362         if (!refcount_release(&vp->v_usecount))
 3363                 return;
 3364         vput_final(vp, VRELE);
 3365 }
 3366 
 3367 /*
 3368  * in: locked
 3369  * out: unlocked
 3370  */
 3371 void
 3372 vput(struct vnode *vp)
 3373 {
 3374 
 3375         ASSERT_VOP_LOCKED(vp, __func__);
 3376         ASSERT_VI_UNLOCKED(vp, __func__);
 3377         if (!refcount_release(&vp->v_usecount)) {
 3378                 VOP_UNLOCK(vp);
 3379                 return;
 3380         }
 3381         vput_final(vp, VPUT);
 3382 }
 3383 
 3384 /*
 3385  * in: locked
 3386  * out: locked
 3387  */
 3388 void
 3389 vunref(struct vnode *vp)
 3390 {
 3391 
 3392         ASSERT_VOP_LOCKED(vp, __func__);
 3393         ASSERT_VI_UNLOCKED(vp, __func__);
 3394         if (!refcount_release(&vp->v_usecount))
 3395                 return;
 3396         vput_final(vp, VUNREF);
 3397 }
 3398 
 3399 void
 3400 vhold(struct vnode *vp)
 3401 {
 3402         int old;
 3403 
 3404         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3405         old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
 3406         VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
 3407             ("%s: wrong hold count %d", __func__, old));
 3408         if (old == 0)
 3409                 vfs_freevnodes_dec();
 3410 }
 3411 
 3412 void
 3413 vholdnz(struct vnode *vp)
 3414 {
 3415 
 3416         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3417 #ifdef INVARIANTS
 3418         int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
 3419         VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
 3420             ("%s: wrong hold count %d", __func__, old));
 3421 #else
 3422         atomic_add_int(&vp->v_holdcnt, 1);
 3423 #endif
 3424 }
 3425 
 3426 /*
 3427  * Grab a hold count unless the vnode is freed.
 3428  *
 3429  * Only use this routine if vfs smr is the only protection you have against
 3430  * freeing the vnode.
 3431  *
 3432  * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag
 3433  * is not set.  After the flag is set the vnode becomes immutable to anyone but
 3434  * the thread which managed to set the flag.
 3435  *
 3436  * It may be tempting to replace the loop with:
 3437  * count = atomic_fetchadd_int(&vp->v_holdcnt, 1);
 3438  * if (count & VHOLD_NO_SMR) {
 3439  *     backpedal and error out;
 3440  * }
 3441  *
 3442  * However, while this is more performant, it hinders debugging by eliminating
 3443  * the previously mentioned invariant.
 3444  */
 3445 bool
 3446 vhold_smr(struct vnode *vp)
 3447 {
 3448         int count;
 3449 
 3450         VFS_SMR_ASSERT_ENTERED();
 3451 
 3452         count = atomic_load_int(&vp->v_holdcnt);
 3453         for (;;) {
 3454                 if (count & VHOLD_NO_SMR) {
 3455                         VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
 3456                             ("non-zero hold count with flags %d\n", count));
 3457                         return (false);
 3458                 }
 3459                 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
 3460                 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
 3461                         if (count == 0)
 3462                                 vfs_freevnodes_dec();
 3463                         return (true);
 3464                 }
 3465         }
 3466 }
 3467 
 3468 /*
 3469  * Hold a free vnode for recycling.
 3470  *
 3471  * Note: vnode_init references this comment.
 3472  *
 3473  * Attempts to recycle only need the global vnode list lock and have no use for
 3474  * SMR.
 3475  *
 3476  * However, vnodes get inserted into the global list before they get fully
 3477  * initialized and stay there until UMA decides to free the memory. This in
 3478  * particular means the target can be found before it becomes usable and after
 3479  * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to
 3480  * VHOLD_NO_SMR.
 3481  *
 3482  * Note: the vnode may gain more references after we transition the count 0->1.
 3483  */
 3484 static bool
 3485 vhold_recycle_free(struct vnode *vp)
 3486 {
 3487         int count;
 3488 
 3489         mtx_assert(&vnode_list_mtx, MA_OWNED);
 3490 
 3491         count = atomic_load_int(&vp->v_holdcnt);
 3492         for (;;) {
 3493                 if (count & VHOLD_NO_SMR) {
 3494                         VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
 3495                             ("non-zero hold count with flags %d\n", count));
 3496                         return (false);
 3497                 }
 3498                 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
 3499                 if (count > 0) {
 3500                         return (false);
 3501                 }
 3502                 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
 3503                         vfs_freevnodes_dec();
 3504                         return (true);
 3505                 }
 3506         }
 3507 }
 3508 
 3509 static void __noinline
 3510 vdbatch_process(struct vdbatch *vd)
 3511 {
 3512         struct vnode *vp;
 3513         int i;
 3514 
 3515         mtx_assert(&vd->lock, MA_OWNED);
 3516         MPASS(curthread->td_pinned > 0);
 3517         MPASS(vd->index == VDBATCH_SIZE);
 3518 
 3519         mtx_lock(&vnode_list_mtx);
 3520         critical_enter();
 3521         freevnodes += vd->freevnodes;
 3522         for (i = 0; i < VDBATCH_SIZE; i++) {
 3523                 vp = vd->tab[i];
 3524                 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
 3525                 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
 3526                 MPASS(vp->v_dbatchcpu != NOCPU);
 3527                 vp->v_dbatchcpu = NOCPU;
 3528         }
 3529         mtx_unlock(&vnode_list_mtx);
 3530         vd->freevnodes = 0;
 3531         bzero(vd->tab, sizeof(vd->tab));
 3532         vd->index = 0;
 3533         critical_exit();
 3534 }
 3535 
 3536 static void
 3537 vdbatch_enqueue(struct vnode *vp)
 3538 {
 3539         struct vdbatch *vd;
 3540 
 3541         ASSERT_VI_LOCKED(vp, __func__);
 3542         VNASSERT(!VN_IS_DOOMED(vp), vp,
 3543             ("%s: deferring requeue of a doomed vnode", __func__));
 3544 
 3545         if (vp->v_dbatchcpu != NOCPU) {
 3546                 VI_UNLOCK(vp);
 3547                 return;
 3548         }
 3549 
 3550         sched_pin();
 3551         vd = DPCPU_PTR(vd);
 3552         mtx_lock(&vd->lock);
 3553         MPASS(vd->index < VDBATCH_SIZE);
 3554         MPASS(vd->tab[vd->index] == NULL);
 3555         /*
 3556          * A hack: we depend on being pinned so that we know what to put in
 3557          * ->v_dbatchcpu.
 3558          */
 3559         vp->v_dbatchcpu = curcpu;
 3560         vd->tab[vd->index] = vp;
 3561         vd->index++;
 3562         VI_UNLOCK(vp);
 3563         if (vd->index == VDBATCH_SIZE)
 3564                 vdbatch_process(vd);
 3565         mtx_unlock(&vd->lock);
 3566         sched_unpin();
 3567 }
 3568 
 3569 /*
 3570  * This routine must only be called for vnodes which are about to be
 3571  * deallocated. Supporting dequeue for arbitrary vndoes would require
 3572  * validating that the locked batch matches.
 3573  */
 3574 static void
 3575 vdbatch_dequeue(struct vnode *vp)
 3576 {
 3577         struct vdbatch *vd;
 3578         int i;
 3579         short cpu;
 3580 
 3581         VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp,
 3582             ("%s: called for a used vnode\n", __func__));
 3583 
 3584         cpu = vp->v_dbatchcpu;
 3585         if (cpu == NOCPU)
 3586                 return;
 3587 
 3588         vd = DPCPU_ID_PTR(cpu, vd);
 3589         mtx_lock(&vd->lock);
 3590         for (i = 0; i < vd->index; i++) {
 3591                 if (vd->tab[i] != vp)
 3592                         continue;
 3593                 vp->v_dbatchcpu = NOCPU;
 3594                 vd->index--;
 3595                 vd->tab[i] = vd->tab[vd->index];
 3596                 vd->tab[vd->index] = NULL;
 3597                 break;
 3598         }
 3599         mtx_unlock(&vd->lock);
 3600         /*
 3601          * Either we dequeued the vnode above or the target CPU beat us to it.
 3602          */
 3603         MPASS(vp->v_dbatchcpu == NOCPU);
 3604 }
 3605 
 3606 /*
 3607  * Drop the hold count of the vnode.  If this is the last reference to
 3608  * the vnode we place it on the free list unless it has been vgone'd
 3609  * (marked VIRF_DOOMED) in which case we will free it.
 3610  *
 3611  * Because the vnode vm object keeps a hold reference on the vnode if
 3612  * there is at least one resident non-cached page, the vnode cannot
 3613  * leave the active list without the page cleanup done.
 3614  */
 3615 static void __noinline
 3616 vdropl_final(struct vnode *vp)
 3617 {
 3618 
 3619         ASSERT_VI_LOCKED(vp, __func__);
 3620         VNPASS(VN_IS_DOOMED(vp), vp);
 3621         /*
 3622          * Set the VHOLD_NO_SMR flag.
 3623          *
 3624          * We may be racing against vhold_smr. If they win we can just pretend
 3625          * we never got this far, they will vdrop later.
 3626          */
 3627         if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) {
 3628                 vfs_freevnodes_inc();
 3629                 VI_UNLOCK(vp);
 3630                 /*
 3631                  * We lost the aforementioned race. Any subsequent access is
 3632                  * invalid as they might have managed to vdropl on their own.
 3633                  */
 3634                 return;
 3635         }
 3636         /*
 3637          * Don't bump freevnodes as this one is going away.
 3638          */
 3639         freevnode(vp);
 3640 }
 3641 
 3642 void
 3643 vdrop(struct vnode *vp)
 3644 {
 3645 
 3646         ASSERT_VI_UNLOCKED(vp, __func__);
 3647         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3648         if (refcount_release_if_not_last(&vp->v_holdcnt))
 3649                 return;
 3650         VI_LOCK(vp);
 3651         vdropl(vp);
 3652 }
 3653 
 3654 static void __always_inline
 3655 vdropl_impl(struct vnode *vp, bool enqueue)
 3656 {
 3657 
 3658         ASSERT_VI_LOCKED(vp, __func__);
 3659         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3660         if (!refcount_release(&vp->v_holdcnt)) {
 3661                 VI_UNLOCK(vp);
 3662                 return;
 3663         }
 3664         VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp);
 3665         VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp);
 3666         if (VN_IS_DOOMED(vp)) {
 3667                 vdropl_final(vp);
 3668                 return;
 3669         }
 3670 
 3671         vfs_freevnodes_inc();
 3672         if (vp->v_mflag & VMP_LAZYLIST) {
 3673                 vunlazy(vp);
 3674         }
 3675 
 3676         if (!enqueue) {
 3677                 VI_UNLOCK(vp);
 3678                 return;
 3679         }
 3680 
 3681         /*
 3682          * Also unlocks the interlock. We can't assert on it as we
 3683          * released our hold and by now the vnode might have been
 3684          * freed.
 3685          */
 3686         vdbatch_enqueue(vp);
 3687 }
 3688 
 3689 void
 3690 vdropl(struct vnode *vp)
 3691 {
 3692 
 3693         vdropl_impl(vp, true);
 3694 }
 3695 
 3696 /*
 3697  * vdrop a vnode when recycling
 3698  *
 3699  * This is a special case routine only to be used when recycling, differs from
 3700  * regular vdrop by not requeieing the vnode on LRU.
 3701  *
 3702  * Consider a case where vtryrecycle continuously fails with all vnodes (due to
 3703  * e.g., frozen writes on the filesystem), filling the batch and causing it to
 3704  * be requeued. Then vnlru will end up revisiting the same vnodes. This is a
 3705  * loop which can last for as long as writes are frozen.
 3706  */
 3707 static void
 3708 vdropl_recycle(struct vnode *vp)
 3709 {
 3710 
 3711         vdropl_impl(vp, false);
 3712 }
 3713 
 3714 static void
 3715 vdrop_recycle(struct vnode *vp)
 3716 {
 3717 
 3718         VI_LOCK(vp);
 3719         vdropl_recycle(vp);
 3720 }
 3721 
 3722 /*
 3723  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
 3724  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
 3725  */
 3726 static int
 3727 vinactivef(struct vnode *vp)
 3728 {
 3729         struct vm_object *obj;
 3730         int error;
 3731 
 3732         ASSERT_VOP_ELOCKED(vp, "vinactive");
 3733         ASSERT_VI_LOCKED(vp, "vinactive");
 3734         VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
 3735             ("vinactive: recursed on VI_DOINGINACT"));
 3736         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3737         vp->v_iflag |= VI_DOINGINACT;
 3738         vp->v_iflag &= ~VI_OWEINACT;
 3739         VI_UNLOCK(vp);
 3740         /*
 3741          * Before moving off the active list, we must be sure that any
 3742          * modified pages are converted into the vnode's dirty
 3743          * buffers, since these will no longer be checked once the
 3744          * vnode is on the inactive list.
 3745          *
 3746          * The write-out of the dirty pages is asynchronous.  At the
 3747          * point that VOP_INACTIVE() is called, there could still be
 3748          * pending I/O and dirty pages in the object.
 3749          */
 3750         if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
 3751             vm_object_mightbedirty(obj)) {
 3752                 VM_OBJECT_WLOCK(obj);
 3753                 vm_object_page_clean(obj, 0, 0, 0);
 3754                 VM_OBJECT_WUNLOCK(obj);
 3755         }
 3756         error = VOP_INACTIVE(vp);
 3757         VI_LOCK(vp);
 3758         VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
 3759             ("vinactive: lost VI_DOINGINACT"));
 3760         vp->v_iflag &= ~VI_DOINGINACT;
 3761         return (error);
 3762 }
 3763 
 3764 int
 3765 vinactive(struct vnode *vp)
 3766 {
 3767 
 3768         ASSERT_VOP_ELOCKED(vp, "vinactive");
 3769         ASSERT_VI_LOCKED(vp, "vinactive");
 3770         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3771 
 3772         if ((vp->v_iflag & VI_OWEINACT) == 0)
 3773                 return (0);
 3774         if (vp->v_iflag & VI_DOINGINACT)
 3775                 return (0);
 3776         if (vp->v_usecount > 0) {
 3777                 vp->v_iflag &= ~VI_OWEINACT;
 3778                 return (0);
 3779         }
 3780         return (vinactivef(vp));
 3781 }
 3782 
 3783 /*
 3784  * Remove any vnodes in the vnode table belonging to mount point mp.
 3785  *
 3786  * If FORCECLOSE is not specified, there should not be any active ones,
 3787  * return error if any are found (nb: this is a user error, not a
 3788  * system error). If FORCECLOSE is specified, detach any active vnodes
 3789  * that are found.
 3790  *
 3791  * If WRITECLOSE is set, only flush out regular file vnodes open for
 3792  * writing.
 3793  *
 3794  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
 3795  *
 3796  * `rootrefs' specifies the base reference count for the root vnode
 3797  * of this filesystem. The root vnode is considered busy if its
 3798  * v_usecount exceeds this value. On a successful return, vflush(, td)
 3799  * will call vrele() on the root vnode exactly rootrefs times.
 3800  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 3801  * be zero.
 3802  */
 3803 #ifdef DIAGNOSTIC
 3804 static int busyprt = 0;         /* print out busy vnodes */
 3805 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
 3806 #endif
 3807 
 3808 int
 3809 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
 3810 {
 3811         struct vnode *vp, *mvp, *rootvp = NULL;
 3812         struct vattr vattr;
 3813         int busy = 0, error;
 3814 
 3815         CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
 3816             rootrefs, flags);
 3817         if (rootrefs > 0) {
 3818                 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
 3819                     ("vflush: bad args"));
 3820                 /*
 3821                  * Get the filesystem root vnode. We can vput() it
 3822                  * immediately, since with rootrefs > 0, it won't go away.
 3823                  */
 3824                 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
 3825                         CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
 3826                             __func__, error);
 3827                         return (error);
 3828                 }
 3829                 vput(rootvp);
 3830         }
 3831 loop:
 3832         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 3833                 vholdl(vp);
 3834                 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
 3835                 if (error) {
 3836                         vdrop(vp);
 3837                         MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 3838                         goto loop;
 3839                 }
 3840                 /*
 3841                  * Skip over a vnodes marked VV_SYSTEM.
 3842                  */
 3843                 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
 3844                         VOP_UNLOCK(vp);
 3845                         vdrop(vp);
 3846                         continue;
 3847                 }
 3848                 /*
 3849                  * If WRITECLOSE is set, flush out unlinked but still open
 3850                  * files (even if open only for reading) and regular file
 3851                  * vnodes open for writing.
 3852                  */
 3853                 if (flags & WRITECLOSE) {
 3854                         if (vp->v_object != NULL) {
 3855                                 VM_OBJECT_WLOCK(vp->v_object);
 3856                                 vm_object_page_clean(vp->v_object, 0, 0, 0);
 3857                                 VM_OBJECT_WUNLOCK(vp->v_object);
 3858                         }
 3859                         do {
 3860                                 error = VOP_FSYNC(vp, MNT_WAIT, td);
 3861                         } while (error == ERELOOKUP);
 3862                         if (error != 0) {
 3863                                 VOP_UNLOCK(vp);
 3864                                 vdrop(vp);
 3865                                 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 3866                                 return (error);
 3867                         }
 3868                         error = VOP_GETATTR(vp, &vattr, td->td_ucred);
 3869                         VI_LOCK(vp);
 3870 
 3871                         if ((vp->v_type == VNON ||
 3872                             (error == 0 && vattr.va_nlink > 0)) &&
 3873                             (vp->v_writecount <= 0 || vp->v_type != VREG)) {
 3874                                 VOP_UNLOCK(vp);
 3875                                 vdropl(vp);
 3876                                 continue;
 3877                         }
 3878                 } else
 3879                         VI_LOCK(vp);
 3880                 /*
 3881                  * With v_usecount == 0, all we need to do is clear out the
 3882                  * vnode data structures and we are done.
 3883                  *
 3884                  * If FORCECLOSE is set, forcibly close the vnode.
 3885                  */
 3886                 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
 3887                         vgonel(vp);
 3888                 } else {
 3889                         busy++;
 3890 #ifdef DIAGNOSTIC
 3891                         if (busyprt)
 3892                                 vn_printf(vp, "vflush: busy vnode ");
 3893 #endif
 3894                 }
 3895                 VOP_UNLOCK(vp);
 3896                 vdropl(vp);
 3897         }
 3898         if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
 3899                 /*
 3900                  * If just the root vnode is busy, and if its refcount
 3901                  * is equal to `rootrefs', then go ahead and kill it.
 3902                  */
 3903                 VI_LOCK(rootvp);
 3904                 KASSERT(busy > 0, ("vflush: not busy"));
 3905                 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
 3906                     ("vflush: usecount %d < rootrefs %d",
 3907                      rootvp->v_usecount, rootrefs));
 3908                 if (busy == 1 && rootvp->v_usecount == rootrefs) {
 3909                         VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
 3910                         vgone(rootvp);
 3911                         VOP_UNLOCK(rootvp);
 3912                         busy = 0;
 3913                 } else
 3914                         VI_UNLOCK(rootvp);
 3915         }
 3916         if (busy) {
 3917                 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
 3918                     busy);
 3919                 return (EBUSY);
 3920         }
 3921         for (; rootrefs > 0; rootrefs--)
 3922                 vrele(rootvp);
 3923         return (0);
 3924 }
 3925 
 3926 /*
 3927  * Recycle an unused vnode to the front of the free list.
 3928  */
 3929 int
 3930 vrecycle(struct vnode *vp)
 3931 {
 3932         int recycled;
 3933 
 3934         VI_LOCK(vp);
 3935         recycled = vrecyclel(vp);
 3936         VI_UNLOCK(vp);
 3937         return (recycled);
 3938 }
 3939 
 3940 /*
 3941  * vrecycle, with the vp interlock held.
 3942  */
 3943 int
 3944 vrecyclel(struct vnode *vp)
 3945 {
 3946         int recycled;
 3947 
 3948         ASSERT_VOP_ELOCKED(vp, __func__);
 3949         ASSERT_VI_LOCKED(vp, __func__);
 3950         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3951         recycled = 0;
 3952         if (vp->v_usecount == 0) {
 3953                 recycled = 1;
 3954                 vgonel(vp);
 3955         }
 3956         return (recycled);
 3957 }
 3958 
 3959 /*
 3960  * Eliminate all activity associated with a vnode
 3961  * in preparation for reuse.
 3962  */
 3963 void
 3964 vgone(struct vnode *vp)
 3965 {
 3966         VI_LOCK(vp);
 3967         vgonel(vp);
 3968         VI_UNLOCK(vp);
 3969 }
 3970 
 3971 static void
 3972 notify_lowervp_vfs_dummy(struct mount *mp __unused,
 3973     struct vnode *lowervp __unused)
 3974 {
 3975 }
 3976 
 3977 /*
 3978  * Notify upper mounts about reclaimed or unlinked vnode.
 3979  */
 3980 void
 3981 vfs_notify_upper(struct vnode *vp, int event)
 3982 {
 3983         static struct vfsops vgonel_vfsops = {
 3984                 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
 3985                 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
 3986         };
 3987         struct mount *mp, *ump, *mmp;
 3988 
 3989         mp = vp->v_mount;
 3990         if (mp == NULL)
 3991                 return;
 3992         if (TAILQ_EMPTY(&mp->mnt_uppers))
 3993                 return;
 3994 
 3995         mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
 3996         mmp->mnt_op = &vgonel_vfsops;
 3997         mmp->mnt_kern_flag |= MNTK_MARKER;
 3998         MNT_ILOCK(mp);
 3999         mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
 4000         for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
 4001                 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
 4002                         ump = TAILQ_NEXT(ump, mnt_upper_link);
 4003                         continue;
 4004                 }
 4005                 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
 4006                 MNT_IUNLOCK(mp);
 4007                 switch (event) {
 4008                 case VFS_NOTIFY_UPPER_RECLAIM:
 4009                         VFS_RECLAIM_LOWERVP(ump, vp);
 4010                         break;
 4011                 case VFS_NOTIFY_UPPER_UNLINK:
 4012                         VFS_UNLINK_LOWERVP(ump, vp);
 4013                         break;
 4014                 default:
 4015                         KASSERT(0, ("invalid event %d", event));
 4016                         break;
 4017                 }
 4018                 MNT_ILOCK(mp);
 4019                 ump = TAILQ_NEXT(mmp, mnt_upper_link);
 4020                 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
 4021         }
 4022         free(mmp, M_TEMP);
 4023         mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
 4024         if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
 4025                 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
 4026                 wakeup(&mp->mnt_uppers);
 4027         }
 4028         MNT_IUNLOCK(mp);
 4029 }
 4030 
 4031 /*
 4032  * vgone, with the vp interlock held.
 4033  */
 4034 static void
 4035 vgonel(struct vnode *vp)
 4036 {
 4037         struct thread *td;
 4038         struct mount *mp;
 4039         vm_object_t object;
 4040         bool active, doinginact, oweinact;
 4041 
 4042         ASSERT_VOP_ELOCKED(vp, "vgonel");
 4043         ASSERT_VI_LOCKED(vp, "vgonel");
 4044         VNASSERT(vp->v_holdcnt, vp,
 4045             ("vgonel: vp %p has no reference.", vp));
 4046         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 4047         td = curthread;
 4048 
 4049         /*
 4050          * Don't vgonel if we're already doomed.
 4051          */
 4052         if (VN_IS_DOOMED(vp))
 4053                 return;
 4054         /*
 4055          * Paired with freevnode.
 4056          */
 4057         vn_seqc_write_begin_locked(vp);
 4058         vunlazy_gone(vp);
 4059         vn_irflag_set_locked(vp, VIRF_DOOMED);
 4060 
 4061         /*
 4062          * Check to see if the vnode is in use.  If so, we have to
 4063          * call VOP_CLOSE() and VOP_INACTIVE().
 4064          *
 4065          * It could be that VOP_INACTIVE() requested reclamation, in
 4066          * which case we should avoid recursion, so check
 4067          * VI_DOINGINACT.  This is not precise but good enough.
 4068          */
 4069         active = vp->v_usecount > 0;
 4070         oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
 4071         doinginact = (vp->v_iflag & VI_DOINGINACT) != 0;
 4072 
 4073         /*
 4074          * If we need to do inactive VI_OWEINACT will be set.
 4075          */
 4076         if (vp->v_iflag & VI_DEFINACT) {
 4077                 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
 4078                 vp->v_iflag &= ~VI_DEFINACT;
 4079                 vdropl(vp);
 4080         } else {
 4081                 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count"));
 4082                 VI_UNLOCK(vp);
 4083         }
 4084         cache_purge_vgone(vp);
 4085         vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
 4086 
 4087         /*
 4088          * If purging an active vnode, it must be closed and
 4089          * deactivated before being reclaimed.
 4090          */
 4091         if (active)
 4092                 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
 4093         if (!doinginact) {
 4094                 do {
 4095                         if (oweinact || active) {
 4096                                 VI_LOCK(vp);
 4097                                 vinactivef(vp);
 4098                                 oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
 4099                                 VI_UNLOCK(vp);
 4100                         }
 4101                 } while (oweinact);
 4102         }
 4103         if (vp->v_type == VSOCK)
 4104                 vfs_unp_reclaim(vp);
 4105 
 4106         /*
 4107          * Clean out any buffers associated with the vnode.
 4108          * If the flush fails, just toss the buffers.
 4109          */
 4110         mp = NULL;
 4111         if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
 4112                 (void) vn_start_secondary_write(vp, &mp, V_WAIT);
 4113         if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
 4114                 while (vinvalbuf(vp, 0, 0, 0) != 0)
 4115                         ;
 4116         }
 4117 
 4118         BO_LOCK(&vp->v_bufobj);
 4119         KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
 4120             vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
 4121             TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
 4122             vp->v_bufobj.bo_clean.bv_cnt == 0,
 4123             ("vp %p bufobj not invalidated", vp));
 4124 
 4125         /*
 4126          * For VMIO bufobj, BO_DEAD is set later, or in
 4127          * vm_object_terminate() after the object's page queue is
 4128          * flushed.
 4129          */
 4130         object = vp->v_bufobj.bo_object;
 4131         if (object == NULL)
 4132                 vp->v_bufobj.bo_flag |= BO_DEAD;
 4133         BO_UNLOCK(&vp->v_bufobj);
 4134 
 4135         /*
 4136          * Handle the VM part.  Tmpfs handles v_object on its own (the
 4137          * OBJT_VNODE check).  Nullfs or other bypassing filesystems
 4138          * should not touch the object borrowed from the lower vnode
 4139          * (the handle check).
 4140          */
 4141         if (object != NULL && object->type == OBJT_VNODE &&
 4142             object->handle == vp)
 4143                 vnode_destroy_vobject(vp);
 4144 
 4145         /*
 4146          * Reclaim the vnode.
 4147          */
 4148         if (VOP_RECLAIM(vp))
 4149                 panic("vgone: cannot reclaim");
 4150         if (mp != NULL)
 4151                 vn_finished_secondary_write(mp);
 4152         VNASSERT(vp->v_object == NULL, vp,
 4153             ("vop_reclaim left v_object vp=%p", vp));
 4154         /*
 4155          * Clear the advisory locks and wake up waiting threads.
 4156          */
 4157         (void)VOP_ADVLOCKPURGE(vp);
 4158         vp->v_lockf = NULL;
 4159         /*
 4160          * Delete from old mount point vnode list.
 4161          */
 4162         delmntque(vp);
 4163         /*
 4164          * Done with purge, reset to the standard lock and invalidate
 4165          * the vnode.
 4166          */
 4167         VI_LOCK(vp);
 4168         vp->v_vnlock = &vp->v_lock;
 4169         vp->v_op = &dead_vnodeops;
 4170         vp->v_type = VBAD;
 4171 }
 4172 
 4173 /*
 4174  * Print out a description of a vnode.
 4175  */
 4176 static const char * const typename[] =
 4177 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
 4178  "VMARKER"};
 4179 
 4180 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0,
 4181     "new hold count flag not added to vn_printf");
 4182 
 4183 void
 4184 vn_printf(struct vnode *vp, const char *fmt, ...)
 4185 {
 4186         va_list ap;
 4187         char buf[256], buf2[16];
 4188         u_long flags;
 4189         u_int holdcnt;
 4190         short irflag;
 4191 
 4192         va_start(ap, fmt);
 4193         vprintf(fmt, ap);
 4194         va_end(ap);
 4195         printf("%p: ", (void *)vp);
 4196         printf("type %s\n", typename[vp->v_type]);
 4197         holdcnt = atomic_load_int(&vp->v_holdcnt);
 4198         printf("    usecount %d, writecount %d, refcount %d seqc users %d",
 4199             vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS,
 4200             vp->v_seqc_users);
 4201         switch (vp->v_type) {
 4202         case VDIR:
 4203                 printf(" mountedhere %p\n", vp->v_mountedhere);
 4204                 break;
 4205         case VCHR:
 4206                 printf(" rdev %p\n", vp->v_rdev);
 4207                 break;
 4208         case VSOCK:
 4209                 printf(" socket %p\n", vp->v_unpcb);
 4210                 break;
 4211         case VFIFO:
 4212                 printf(" fifoinfo %p\n", vp->v_fifoinfo);
 4213                 break;
 4214         default:
 4215                 printf("\n");
 4216                 break;
 4217         }
 4218         buf[0] = '\0';
 4219         buf[1] = '\0';
 4220         if (holdcnt & VHOLD_NO_SMR)
 4221                 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf));
 4222         printf("    hold count flags (%s)\n", buf + 1);
 4223 
 4224         buf[0] = '\0';
 4225         buf[1] = '\0';
 4226         irflag = vn_irflag_read(vp);
 4227         if (irflag & VIRF_DOOMED)
 4228                 strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
 4229         if (irflag & VIRF_PGREAD)
 4230                 strlcat(buf, "|VIRF_PGREAD", sizeof(buf));
 4231         if (irflag & VIRF_MOUNTPOINT)
 4232                 strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf));
 4233         flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT);
 4234         if (flags != 0) {
 4235                 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
 4236                 strlcat(buf, buf2, sizeof(buf));
 4237         }
 4238         if (vp->v_vflag & VV_ROOT)
 4239                 strlcat(buf, "|VV_ROOT", sizeof(buf));
 4240         if (vp->v_vflag & VV_ISTTY)
 4241                 strlcat(buf, "|VV_ISTTY", sizeof(buf));
 4242         if (vp->v_vflag & VV_NOSYNC)
 4243                 strlcat(buf, "|VV_NOSYNC", sizeof(buf));
 4244         if (vp->v_vflag & VV_ETERNALDEV)
 4245                 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
 4246         if (vp->v_vflag & VV_CACHEDLABEL)
 4247                 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
 4248         if (vp->v_vflag & VV_VMSIZEVNLOCK)
 4249                 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf));
 4250         if (vp->v_vflag & VV_COPYONWRITE)
 4251                 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
 4252         if (vp->v_vflag & VV_SYSTEM)
 4253                 strlcat(buf, "|VV_SYSTEM", sizeof(buf));
 4254         if (vp->v_vflag & VV_PROCDEP)
 4255                 strlcat(buf, "|VV_PROCDEP", sizeof(buf));
 4256         if (vp->v_vflag & VV_NOKNOTE)
 4257                 strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
 4258         if (vp->v_vflag & VV_DELETED)
 4259                 strlcat(buf, "|VV_DELETED", sizeof(buf));
 4260         if (vp->v_vflag & VV_MD)
 4261                 strlcat(buf, "|VV_MD", sizeof(buf));
 4262         if (vp->v_vflag & VV_FORCEINSMQ)
 4263                 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
 4264         if (vp->v_vflag & VV_READLINK)
 4265                 strlcat(buf, "|VV_READLINK", sizeof(buf));
 4266         flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
 4267             VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM |
 4268             VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ |
 4269             VV_READLINK);
 4270         if (flags != 0) {
 4271                 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
 4272                 strlcat(buf, buf2, sizeof(buf));
 4273         }
 4274         if (vp->v_iflag & VI_TEXT_REF)
 4275                 strlcat(buf, "|VI_TEXT_REF", sizeof(buf));
 4276         if (vp->v_iflag & VI_MOUNT)
 4277                 strlcat(buf, "|VI_MOUNT", sizeof(buf));
 4278         if (vp->v_iflag & VI_DOINGINACT)
 4279                 strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
 4280         if (vp->v_iflag & VI_OWEINACT)
 4281                 strlcat(buf, "|VI_OWEINACT", sizeof(buf));
 4282         if (vp->v_iflag & VI_DEFINACT)
 4283                 strlcat(buf, "|VI_DEFINACT", sizeof(buf));
 4284         if (vp->v_iflag & VI_FOPENING)
 4285                 strlcat(buf, "|VI_FOPENING", sizeof(buf));
 4286         flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_DOINGINACT |
 4287             VI_OWEINACT | VI_DEFINACT | VI_FOPENING);
 4288         if (flags != 0) {
 4289                 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
 4290                 strlcat(buf, buf2, sizeof(buf));
 4291         }
 4292         if (vp->v_mflag & VMP_LAZYLIST)
 4293                 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf));
 4294         flags = vp->v_mflag & ~(VMP_LAZYLIST);
 4295         if (flags != 0) {
 4296                 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
 4297                 strlcat(buf, buf2, sizeof(buf));
 4298         }
 4299         printf("    flags (%s)", buf + 1);
 4300         if (mtx_owned(VI_MTX(vp)))
 4301                 printf(" VI_LOCKed");
 4302         printf("\n");
 4303         if (vp->v_object != NULL)
 4304                 printf("    v_object %p ref %d pages %d "
 4305                     "cleanbuf %d dirtybuf %d\n",
 4306                     vp->v_object, vp->v_object->ref_count,
 4307                     vp->v_object->resident_page_count,
 4308                     vp->v_bufobj.bo_clean.bv_cnt,
 4309                     vp->v_bufobj.bo_dirty.bv_cnt);
 4310         printf("    ");
 4311         lockmgr_printinfo(vp->v_vnlock);
 4312         if (vp->v_data != NULL)
 4313                 VOP_PRINT(vp);
 4314 }
 4315 
 4316 #ifdef DDB
 4317 /*
 4318  * List all of the locked vnodes in the system.
 4319  * Called when debugging the kernel.
 4320  */
 4321 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
 4322 {
 4323         struct mount *mp;
 4324         struct vnode *vp;
 4325 
 4326         /*
 4327          * Note: because this is DDB, we can't obey the locking semantics
 4328          * for these structures, which means we could catch an inconsistent
 4329          * state and dereference a nasty pointer.  Not much to be done
 4330          * about that.
 4331          */
 4332         db_printf("Locked vnodes\n");
 4333         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 4334                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 4335                         if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
 4336                                 vn_printf(vp, "vnode ");
 4337                 }
 4338         }
 4339 }
 4340 
 4341 /*
 4342  * Show details about the given vnode.
 4343  */
 4344 DB_SHOW_COMMAND(vnode, db_show_vnode)
 4345 {
 4346         struct vnode *vp;
 4347 
 4348         if (!have_addr)
 4349                 return;
 4350         vp = (struct vnode *)addr;
 4351         vn_printf(vp, "vnode ");
 4352 }
 4353 
 4354 /*
 4355  * Show details about the given mount point.
 4356  */
 4357 DB_SHOW_COMMAND(mount, db_show_mount)
 4358 {
 4359         struct mount *mp;
 4360         struct vfsopt *opt;
 4361         struct statfs *sp;
 4362         struct vnode *vp;
 4363         char buf[512];
 4364         uint64_t mflags;
 4365         u_int flags;
 4366 
 4367         if (!have_addr) {
 4368                 /* No address given, print short info about all mount points. */
 4369                 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 4370                         db_printf("%p %s on %s (%s)\n", mp,
 4371                             mp->mnt_stat.f_mntfromname,
 4372                             mp->mnt_stat.f_mntonname,
 4373                             mp->mnt_stat.f_fstypename);
 4374                         if (db_pager_quit)
 4375                                 break;
 4376                 }
 4377                 db_printf("\nMore info: show mount <addr>\n");
 4378                 return;
 4379         }
 4380 
 4381         mp = (struct mount *)addr;
 4382         db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
 4383             mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
 4384 
 4385         buf[0] = '\0';
 4386         mflags = mp->mnt_flag;
 4387 #define MNT_FLAG(flag)  do {                                            \
 4388         if (mflags & (flag)) {                                          \
 4389                 if (buf[0] != '\0')                                     \
 4390                         strlcat(buf, ", ", sizeof(buf));                \
 4391                 strlcat(buf, (#flag) + 4, sizeof(buf));                 \
 4392                 mflags &= ~(flag);                                      \
 4393         }                                                               \
 4394 } while (0)
 4395         MNT_FLAG(MNT_RDONLY);
 4396         MNT_FLAG(MNT_SYNCHRONOUS);
 4397         MNT_FLAG(MNT_NOEXEC);
 4398         MNT_FLAG(MNT_NOSUID);
 4399         MNT_FLAG(MNT_NFS4ACLS);
 4400         MNT_FLAG(MNT_UNION);
 4401         MNT_FLAG(MNT_ASYNC);
 4402         MNT_FLAG(MNT_SUIDDIR);
 4403         MNT_FLAG(MNT_SOFTDEP);
 4404         MNT_FLAG(MNT_NOSYMFOLLOW);
 4405         MNT_FLAG(MNT_GJOURNAL);
 4406         MNT_FLAG(MNT_MULTILABEL);
 4407         MNT_FLAG(MNT_ACLS);
 4408         MNT_FLAG(MNT_NOATIME);
 4409         MNT_FLAG(MNT_NOCLUSTERR);
 4410         MNT_FLAG(MNT_NOCLUSTERW);
 4411         MNT_FLAG(MNT_SUJ);
 4412         MNT_FLAG(MNT_EXRDONLY);
 4413         MNT_FLAG(MNT_EXPORTED);
 4414         MNT_FLAG(MNT_DEFEXPORTED);
 4415         MNT_FLAG(MNT_EXPORTANON);
 4416         MNT_FLAG(MNT_EXKERB);
 4417         MNT_FLAG(MNT_EXPUBLIC);
 4418         MNT_FLAG(MNT_LOCAL);
 4419         MNT_FLAG(MNT_QUOTA);
 4420         MNT_FLAG(MNT_ROOTFS);
 4421         MNT_FLAG(MNT_USER);
 4422         MNT_FLAG(MNT_IGNORE);
 4423         MNT_FLAG(MNT_UPDATE);
 4424         MNT_FLAG(MNT_DELEXPORT);
 4425         MNT_FLAG(MNT_RELOAD);
 4426         MNT_FLAG(MNT_FORCE);
 4427         MNT_FLAG(MNT_SNAPSHOT);
 4428         MNT_FLAG(MNT_BYFSID);
 4429 #undef MNT_FLAG
 4430         if (mflags != 0) {
 4431                 if (buf[0] != '\0')
 4432                         strlcat(buf, ", ", sizeof(buf));
 4433                 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
 4434                     "0x%016jx", mflags);
 4435         }
 4436         db_printf("    mnt_flag = %s\n", buf);
 4437 
 4438         buf[0] = '\0';
 4439         flags = mp->mnt_kern_flag;
 4440 #define MNT_KERN_FLAG(flag)     do {                                    \
 4441         if (flags & (flag)) {                                           \
 4442                 if (buf[0] != '\0')                                     \
 4443                         strlcat(buf, ", ", sizeof(buf));                \
 4444                 strlcat(buf, (#flag) + 5, sizeof(buf));                 \
 4445                 flags &= ~(flag);                                       \
 4446         }                                                               \
 4447 } while (0)
 4448         MNT_KERN_FLAG(MNTK_UNMOUNTF);
 4449         MNT_KERN_FLAG(MNTK_ASYNC);
 4450         MNT_KERN_FLAG(MNTK_SOFTDEP);
 4451         MNT_KERN_FLAG(MNTK_DRAINING);
 4452         MNT_KERN_FLAG(MNTK_REFEXPIRE);
 4453         MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
 4454         MNT_KERN_FLAG(MNTK_SHARED_WRITES);
 4455         MNT_KERN_FLAG(MNTK_NO_IOPF);
 4456         MNT_KERN_FLAG(MNTK_VGONE_UPPER);
 4457         MNT_KERN_FLAG(MNTK_VGONE_WAITER);
 4458         MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
 4459         MNT_KERN_FLAG(MNTK_MARKER);
 4460         MNT_KERN_FLAG(MNTK_USES_BCACHE);
 4461         MNT_KERN_FLAG(MNTK_FPLOOKUP);
 4462         MNT_KERN_FLAG(MNTK_NOASYNC);
 4463         MNT_KERN_FLAG(MNTK_UNMOUNT);
 4464         MNT_KERN_FLAG(MNTK_MWAIT);
 4465         MNT_KERN_FLAG(MNTK_SUSPEND);
 4466         MNT_KERN_FLAG(MNTK_SUSPEND2);
 4467         MNT_KERN_FLAG(MNTK_SUSPENDED);
 4468         MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
 4469         MNT_KERN_FLAG(MNTK_NOKNOTE);
 4470 #undef MNT_KERN_FLAG
 4471         if (flags != 0) {
 4472                 if (buf[0] != '\0')
 4473                         strlcat(buf, ", ", sizeof(buf));
 4474                 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
 4475                     "0x%08x", flags);
 4476         }
 4477         db_printf("    mnt_kern_flag = %s\n", buf);
 4478 
 4479         db_printf("    mnt_opt = ");
 4480         opt = TAILQ_FIRST(mp->mnt_opt);
 4481         if (opt != NULL) {
 4482                 db_printf("%s", opt->name);
 4483                 opt = TAILQ_NEXT(opt, link);
 4484                 while (opt != NULL) {
 4485                         db_printf(", %s", opt->name);
 4486                         opt = TAILQ_NEXT(opt, link);
 4487                 }
 4488         }
 4489         db_printf("\n");
 4490 
 4491         sp = &mp->mnt_stat;
 4492         db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
 4493             "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
 4494             "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
 4495             "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
 4496             (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
 4497             (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
 4498             (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
 4499             (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
 4500             (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
 4501             (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
 4502             (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
 4503             (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
 4504 
 4505         db_printf("    mnt_cred = { uid=%u ruid=%u",
 4506             (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
 4507         if (jailed(mp->mnt_cred))
 4508                 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
 4509         db_printf(" }\n");
 4510         db_printf("    mnt_ref = %d (with %d in the struct)\n",
 4511             vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
 4512         db_printf("    mnt_gen = %d\n", mp->mnt_gen);
 4513         db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
 4514         db_printf("    mnt_lazyvnodelistsize = %d\n",
 4515             mp->mnt_lazyvnodelistsize);
 4516         db_printf("    mnt_writeopcount = %d (with %d in the struct)\n",
 4517             vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
 4518         db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
 4519         db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
 4520         db_printf("    mnt_lockref = %d (with %d in the struct)\n",
 4521             vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref);
 4522         db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
 4523         db_printf("    mnt_secondary_accwrites = %d\n",
 4524             mp->mnt_secondary_accwrites);
 4525         db_printf("    mnt_gjprovider = %s\n",
 4526             mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
 4527         db_printf("    mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);
 4528 
 4529         db_printf("\n\nList of active vnodes\n");
 4530         TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 4531                 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) {
 4532                         vn_printf(vp, "vnode ");
 4533                         if (db_pager_quit)
 4534                                 break;
 4535                 }
 4536         }
 4537         db_printf("\n\nList of inactive vnodes\n");
 4538         TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 4539                 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) {
 4540                         vn_printf(vp, "vnode ");
 4541                         if (db_pager_quit)
 4542                                 break;
 4543                 }
 4544         }
 4545 }
 4546 #endif  /* DDB */
 4547 
 4548 /*
 4549  * Fill in a struct xvfsconf based on a struct vfsconf.
 4550  */
 4551 static int
 4552 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
 4553 {
 4554         struct xvfsconf xvfsp;
 4555 
 4556         bzero(&xvfsp, sizeof(xvfsp));
 4557         strcpy(xvfsp.vfc_name, vfsp->vfc_name);
 4558         xvfsp.vfc_typenum = vfsp->vfc_typenum;
 4559         xvfsp.vfc_refcount = vfsp->vfc_refcount;
 4560         xvfsp.vfc_flags = vfsp->vfc_flags;
 4561         /*
 4562          * These are unused in userland, we keep them
 4563          * to not break binary compatibility.
 4564          */
 4565         xvfsp.vfc_vfsops = NULL;
 4566         xvfsp.vfc_next = NULL;
 4567         return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 4568 }
 4569 
 4570 #ifdef COMPAT_FREEBSD32
 4571 struct xvfsconf32 {
 4572         uint32_t        vfc_vfsops;
 4573         char            vfc_name[MFSNAMELEN];
 4574         int32_t         vfc_typenum;
 4575         int32_t         vfc_refcount;
 4576         int32_t         vfc_flags;
 4577         uint32_t        vfc_next;
 4578 };
 4579 
 4580 static int
 4581 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
 4582 {
 4583         struct xvfsconf32 xvfsp;
 4584 
 4585         bzero(&xvfsp, sizeof(xvfsp));
 4586         strcpy(xvfsp.vfc_name, vfsp->vfc_name);
 4587         xvfsp.vfc_typenum = vfsp->vfc_typenum;
 4588         xvfsp.vfc_refcount = vfsp->vfc_refcount;
 4589         xvfsp.vfc_flags = vfsp->vfc_flags;
 4590         return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 4591 }
 4592 #endif
 4593 
 4594 /*
 4595  * Top level filesystem related information gathering.
 4596  */
 4597 static int
 4598 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
 4599 {
 4600         struct vfsconf *vfsp;
 4601         int error;
 4602 
 4603         error = 0;
 4604         vfsconf_slock();
 4605         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 4606 #ifdef COMPAT_FREEBSD32
 4607                 if (req->flags & SCTL_MASK32)
 4608                         error = vfsconf2x32(req, vfsp);
 4609                 else
 4610 #endif
 4611                         error = vfsconf2x(req, vfsp);
 4612                 if (error)
 4613                         break;
 4614         }
 4615         vfsconf_sunlock();
 4616         return (error);
 4617 }
 4618 
 4619 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
 4620     CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
 4621     "S,xvfsconf", "List of all configured filesystems");
 4622 
 4623 #ifndef BURN_BRIDGES
 4624 static int      sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
 4625 
 4626 static int
 4627 vfs_sysctl(SYSCTL_HANDLER_ARGS)
 4628 {
 4629         int *name = (int *)arg1 - 1;    /* XXX */
 4630         u_int namelen = arg2 + 1;       /* XXX */
 4631         struct vfsconf *vfsp;
 4632 
 4633         log(LOG_WARNING, "userland calling deprecated sysctl, "
 4634             "please rebuild world\n");
 4635 
 4636 #if 1 || defined(COMPAT_PRELITE2)
 4637         /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
 4638         if (namelen == 1)
 4639                 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
 4640 #endif
 4641 
 4642         switch (name[1]) {
 4643         case VFS_MAXTYPENUM:
 4644                 if (namelen != 2)
 4645                         return (ENOTDIR);
 4646                 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
 4647         case VFS_CONF:
 4648                 if (namelen != 3)
 4649                         return (ENOTDIR);       /* overloaded */
 4650                 vfsconf_slock();
 4651                 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 4652                         if (vfsp->vfc_typenum == name[2])
 4653                                 break;
 4654                 }
 4655                 vfsconf_sunlock();
 4656                 if (vfsp == NULL)
 4657                         return (EOPNOTSUPP);
 4658 #ifdef COMPAT_FREEBSD32
 4659                 if (req->flags & SCTL_MASK32)
 4660                         return (vfsconf2x32(req, vfsp));
 4661                 else
 4662 #endif
 4663                         return (vfsconf2x(req, vfsp));
 4664         }
 4665         return (EOPNOTSUPP);
 4666 }
 4667 
 4668 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
 4669     CTLFLAG_MPSAFE, vfs_sysctl,
 4670     "Generic filesystem");
 4671 
 4672 #if 1 || defined(COMPAT_PRELITE2)
 4673 
 4674 static int
 4675 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
 4676 {
 4677         int error;
 4678         struct vfsconf *vfsp;
 4679         struct ovfsconf ovfs;
 4680 
 4681         vfsconf_slock();
 4682         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 4683                 bzero(&ovfs, sizeof(ovfs));
 4684                 ovfs.vfc_vfsops = vfsp->vfc_vfsops;     /* XXX used as flag */
 4685                 strcpy(ovfs.vfc_name, vfsp->vfc_name);
 4686                 ovfs.vfc_index = vfsp->vfc_typenum;
 4687                 ovfs.vfc_refcount = vfsp->vfc_refcount;
 4688                 ovfs.vfc_flags = vfsp->vfc_flags;
 4689                 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
 4690                 if (error != 0) {
 4691                         vfsconf_sunlock();
 4692                         return (error);
 4693                 }
 4694         }
 4695         vfsconf_sunlock();
 4696         return (0);
 4697 }
 4698 
 4699 #endif /* 1 || COMPAT_PRELITE2 */
 4700 #endif /* !BURN_BRIDGES */
 4701 
 4702 #define KINFO_VNODESLOP         10
 4703 #ifdef notyet
 4704 /*
 4705  * Dump vnode list (via sysctl).
 4706  */
 4707 /* ARGSUSED */
 4708 static int
 4709 sysctl_vnode(SYSCTL_HANDLER_ARGS)
 4710 {
 4711         struct xvnode *xvn;
 4712         struct mount *mp;
 4713         struct vnode *vp;
 4714         int error, len, n;
 4715 
 4716         /*
 4717          * Stale numvnodes access is not fatal here.
 4718          */
 4719         req->lock = 0;
 4720         len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
 4721         if (!req->oldptr)
 4722                 /* Make an estimate */
 4723                 return (SYSCTL_OUT(req, 0, len));
 4724 
 4725         error = sysctl_wire_old_buffer(req, 0);
 4726         if (error != 0)
 4727                 return (error);
 4728         xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
 4729         n = 0;
 4730         mtx_lock(&mountlist_mtx);
 4731         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 4732                 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
 4733                         continue;
 4734                 MNT_ILOCK(mp);
 4735                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 4736                         if (n == len)
 4737                                 break;
 4738                         vref(vp);
 4739                         xvn[n].xv_size = sizeof *xvn;
 4740                         xvn[n].xv_vnode = vp;
 4741                         xvn[n].xv_id = 0;       /* XXX compat */
 4742 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
 4743                         XV_COPY(usecount);
 4744                         XV_COPY(writecount);
 4745                         XV_COPY(holdcnt);
 4746                         XV_COPY(mount);
 4747                         XV_COPY(numoutput);
 4748                         XV_COPY(type);
 4749 #undef XV_COPY
 4750                         xvn[n].xv_flag = vp->v_vflag;
 4751 
 4752                         switch (vp->v_type) {
 4753                         case VREG:
 4754                         case VDIR:
 4755                         case VLNK:
 4756                                 break;
 4757                         case VBLK:
 4758                         case VCHR:
 4759                                 if (vp->v_rdev == NULL) {
 4760                                         vrele(vp);
 4761                                         continue;
 4762                                 }
 4763                                 xvn[n].xv_dev = dev2udev(vp->v_rdev);
 4764                                 break;
 4765                         case VSOCK:
 4766                                 xvn[n].xv_socket = vp->v_socket;
 4767                                 break;
 4768                         case VFIFO:
 4769                                 xvn[n].xv_fifo = vp->v_fifoinfo;
 4770                                 break;
 4771                         case VNON:
 4772                         case VBAD:
 4773                         default:
 4774                                 /* shouldn't happen? */
 4775                                 vrele(vp);
 4776                                 continue;
 4777                         }
 4778                         vrele(vp);
 4779                         ++n;
 4780                 }
 4781                 MNT_IUNLOCK(mp);
 4782                 mtx_lock(&mountlist_mtx);
 4783                 vfs_unbusy(mp);
 4784                 if (n == len)
 4785                         break;
 4786         }
 4787         mtx_unlock(&mountlist_mtx);
 4788 
 4789         error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
 4790         free(xvn, M_TEMP);
 4791         return (error);
 4792 }
 4793 
 4794 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
 4795     CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
 4796     "");
 4797 #endif
 4798 
 4799 static void
 4800 unmount_or_warn(struct mount *mp)
 4801 {
 4802         int error;
 4803 
 4804         error = dounmount(mp, MNT_FORCE, curthread);
 4805         if (error != 0) {
 4806                 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
 4807                 if (error == EBUSY)
 4808                         printf("BUSY)\n");
 4809                 else
 4810                         printf("%d)\n", error);
 4811         }
 4812 }
 4813 
 4814 /*
 4815  * Unmount all filesystems. The list is traversed in reverse order
 4816  * of mounting to avoid dependencies.
 4817  */
 4818 void
 4819 vfs_unmountall(void)
 4820 {
 4821         struct mount *mp, *tmp;
 4822 
 4823         CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
 4824 
 4825         /*
 4826          * Since this only runs when rebooting, it is not interlocked.
 4827          */
 4828         TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
 4829                 vfs_ref(mp);
 4830 
 4831                 /*
 4832                  * Forcibly unmounting "/dev" before "/" would prevent clean
 4833                  * unmount of the latter.
 4834                  */
 4835                 if (mp == rootdevmp)
 4836                         continue;
 4837 
 4838                 unmount_or_warn(mp);
 4839         }
 4840 
 4841         if (rootdevmp != NULL)
 4842                 unmount_or_warn(rootdevmp);
 4843 }
 4844 
 4845 static void
 4846 vfs_deferred_inactive(struct vnode *vp, int lkflags)
 4847 {
 4848 
 4849         ASSERT_VI_LOCKED(vp, __func__);
 4850         VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set"));
 4851         if ((vp->v_iflag & VI_OWEINACT) == 0) {
 4852                 vdropl(vp);
 4853                 return;
 4854         }
 4855         if (vn_lock(vp, lkflags) == 0) {
 4856                 VI_LOCK(vp);
 4857                 vinactive(vp);
 4858                 VOP_UNLOCK(vp);
 4859                 vdropl(vp);
 4860                 return;
 4861         }
 4862         vdefer_inactive_unlocked(vp);
 4863 }
 4864 
 4865 static int
 4866 vfs_periodic_inactive_filter(struct vnode *vp, void *arg)
 4867 {
 4868 
 4869         return (vp->v_iflag & VI_DEFINACT);
 4870 }
 4871 
 4872 static void __noinline
 4873 vfs_periodic_inactive(struct mount *mp, int flags)
 4874 {
 4875         struct vnode *vp, *mvp;
 4876         int lkflags;
 4877 
 4878         lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
 4879         if (flags != MNT_WAIT)
 4880                 lkflags |= LK_NOWAIT;
 4881 
 4882         MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) {
 4883                 if ((vp->v_iflag & VI_DEFINACT) == 0) {
 4884                         VI_UNLOCK(vp);
 4885                         continue;
 4886                 }
 4887                 vp->v_iflag &= ~VI_DEFINACT;
 4888                 vfs_deferred_inactive(vp, lkflags);
 4889         }
 4890 }
 4891 
 4892 static inline bool
 4893 vfs_want_msync(struct vnode *vp)
 4894 {
 4895         struct vm_object *obj;
 4896 
 4897         /*
 4898          * This test may be performed without any locks held.
 4899          * We rely on vm_object's type stability.
 4900          */
 4901         if (vp->v_vflag & VV_NOSYNC)
 4902                 return (false);
 4903         obj = vp->v_object;
 4904         return (obj != NULL && vm_object_mightbedirty(obj));
 4905 }
 4906 
 4907 static int
 4908 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused)
 4909 {
 4910 
 4911         if (vp->v_vflag & VV_NOSYNC)
 4912                 return (false);
 4913         if (vp->v_iflag & VI_DEFINACT)
 4914                 return (true);
 4915         return (vfs_want_msync(vp));
 4916 }
 4917 
 4918 static void __noinline
 4919 vfs_periodic_msync_inactive(struct mount *mp, int flags)
 4920 {
 4921         struct vnode *vp, *mvp;
 4922         struct vm_object *obj;
 4923         int lkflags, objflags;
 4924         bool seen_defer;
 4925 
 4926         lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
 4927         if (flags != MNT_WAIT) {
 4928                 lkflags |= LK_NOWAIT;
 4929                 objflags = OBJPC_NOSYNC;
 4930         } else {
 4931                 objflags = OBJPC_SYNC;
 4932         }
 4933 
 4934         MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) {
 4935                 seen_defer = false;
 4936                 if (vp->v_iflag & VI_DEFINACT) {
 4937                         vp->v_iflag &= ~VI_DEFINACT;
 4938                         seen_defer = true;
 4939                 }
 4940                 if (!vfs_want_msync(vp)) {
 4941                         if (seen_defer)
 4942                                 vfs_deferred_inactive(vp, lkflags);
 4943                         else
 4944                                 VI_UNLOCK(vp);
 4945                         continue;
 4946                 }
 4947                 if (vget(vp, lkflags) == 0) {
 4948                         obj = vp->v_object;
 4949                         if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) {
 4950                                 VM_OBJECT_WLOCK(obj);
 4951                                 vm_object_page_clean(obj, 0, 0, objflags);
 4952                                 VM_OBJECT_WUNLOCK(obj);
 4953                         }
 4954                         vput(vp);
 4955                         if (seen_defer)
 4956                                 vdrop(vp);
 4957                 } else {
 4958                         if (seen_defer)
 4959                                 vdefer_inactive_unlocked(vp);
 4960                 }
 4961         }
 4962 }
 4963 
 4964 void
 4965 vfs_periodic(struct mount *mp, int flags)
 4966 {
 4967 
 4968         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 4969 
 4970         if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
 4971                 vfs_periodic_inactive(mp, flags);
 4972         else
 4973                 vfs_periodic_msync_inactive(mp, flags);
 4974 }
 4975 
 4976 static void
 4977 destroy_vpollinfo_free(struct vpollinfo *vi)
 4978 {
 4979 
 4980         knlist_destroy(&vi->vpi_selinfo.si_note);
 4981         mtx_destroy(&vi->vpi_lock);
 4982         free(vi, M_VNODEPOLL);
 4983 }
 4984 
 4985 static void
 4986 destroy_vpollinfo(struct vpollinfo *vi)
 4987 {
 4988 
 4989         knlist_clear(&vi->vpi_selinfo.si_note, 1);
 4990         seldrain(&vi->vpi_selinfo);
 4991         destroy_vpollinfo_free(vi);
 4992 }
 4993 
 4994 /*
 4995  * Initialize per-vnode helper structure to hold poll-related state.
 4996  */
 4997 void
 4998 v_addpollinfo(struct vnode *vp)
 4999 {
 5000         struct vpollinfo *vi;
 5001 
 5002         if (vp->v_pollinfo != NULL)
 5003                 return;
 5004         vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO);
 5005         mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
 5006         knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
 5007             vfs_knlunlock, vfs_knl_assert_lock);
 5008         VI_LOCK(vp);
 5009         if (vp->v_pollinfo != NULL) {
 5010                 VI_UNLOCK(vp);
 5011                 destroy_vpollinfo_free(vi);
 5012                 return;
 5013         }
 5014         vp->v_pollinfo = vi;
 5015         VI_UNLOCK(vp);
 5016 }
 5017 
 5018 /*
 5019  * Record a process's interest in events which might happen to
 5020  * a vnode.  Because poll uses the historic select-style interface
 5021  * internally, this routine serves as both the ``check for any
 5022  * pending events'' and the ``record my interest in future events''
 5023  * functions.  (These are done together, while the lock is held,
 5024  * to avoid race conditions.)
 5025  */
 5026 int
 5027 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
 5028 {
 5029 
 5030         v_addpollinfo(vp);
 5031         mtx_lock(&vp->v_pollinfo->vpi_lock);
 5032         if (vp->v_pollinfo->vpi_revents & events) {
 5033                 /*
 5034                  * This leaves events we are not interested
 5035                  * in available for the other process which
 5036                  * which presumably had requested them
 5037                  * (otherwise they would never have been
 5038                  * recorded).
 5039                  */
 5040                 events &= vp->v_pollinfo->vpi_revents;
 5041                 vp->v_pollinfo->vpi_revents &= ~events;
 5042 
 5043                 mtx_unlock(&vp->v_pollinfo->vpi_lock);
 5044                 return (events);
 5045         }
 5046         vp->v_pollinfo->vpi_events |= events;
 5047         selrecord(td, &vp->v_pollinfo->vpi_selinfo);
 5048         mtx_unlock(&vp->v_pollinfo->vpi_lock);
 5049         return (0);
 5050 }
 5051 
 5052 /*
 5053  * Routine to create and manage a filesystem syncer vnode.
 5054  */
 5055 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
 5056 static int      sync_fsync(struct  vop_fsync_args *);
 5057 static int      sync_inactive(struct  vop_inactive_args *);
 5058 static int      sync_reclaim(struct  vop_reclaim_args *);
 5059 
 5060 static struct vop_vector sync_vnodeops = {
 5061         .vop_bypass =   VOP_EOPNOTSUPP,
 5062         .vop_close =    sync_close,             /* close */
 5063         .vop_fsync =    sync_fsync,             /* fsync */
 5064         .vop_inactive = sync_inactive,  /* inactive */
 5065         .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */
 5066         .vop_reclaim =  sync_reclaim,   /* reclaim */
 5067         .vop_lock1 =    vop_stdlock,    /* lock */
 5068         .vop_unlock =   vop_stdunlock,  /* unlock */
 5069         .vop_islocked = vop_stdislocked,        /* islocked */
 5070 };
 5071 VFS_VOP_VECTOR_REGISTER(sync_vnodeops);
 5072 
 5073 /*
 5074  * Create a new filesystem syncer vnode for the specified mount point.
 5075  */
 5076 void
 5077 vfs_allocate_syncvnode(struct mount *mp)
 5078 {
 5079         struct vnode *vp;
 5080         struct bufobj *bo;
 5081         static long start, incr, next;
 5082         int error;
 5083 
 5084         /* Allocate a new vnode */
 5085         error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
 5086         if (error != 0)
 5087                 panic("vfs_allocate_syncvnode: getnewvnode() failed");
 5088         vp->v_type = VNON;
 5089         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 5090         vp->v_vflag |= VV_FORCEINSMQ;
 5091         error = insmntque(vp, mp);
 5092         if (error != 0)
 5093                 panic("vfs_allocate_syncvnode: insmntque() failed");
 5094         vp->v_vflag &= ~VV_FORCEINSMQ;
 5095         VOP_UNLOCK(vp);
 5096         /*
 5097          * Place the vnode onto the syncer worklist. We attempt to
 5098          * scatter them about on the list so that they will go off
 5099          * at evenly distributed times even if all the filesystems
 5100          * are mounted at once.
 5101          */
 5102         next += incr;
 5103         if (next == 0 || next > syncer_maxdelay) {
 5104                 start /= 2;
 5105                 incr /= 2;
 5106                 if (start == 0) {
 5107                         start = syncer_maxdelay / 2;
 5108                         incr = syncer_maxdelay;
 5109                 }
 5110                 next = start;
 5111         }
 5112         bo = &vp->v_bufobj;
 5113         BO_LOCK(bo);
 5114         vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
 5115         /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
 5116         mtx_lock(&sync_mtx);
 5117         sync_vnode_count++;
 5118         if (mp->mnt_syncer == NULL) {
 5119                 mp->mnt_syncer = vp;
 5120                 vp = NULL;
 5121         }
 5122         mtx_unlock(&sync_mtx);
 5123         BO_UNLOCK(bo);
 5124         if (vp != NULL) {
 5125                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 5126                 vgone(vp);
 5127                 vput(vp);
 5128         }
 5129 }
 5130 
 5131 void
 5132 vfs_deallocate_syncvnode(struct mount *mp)
 5133 {
 5134         struct vnode *vp;
 5135 
 5136         mtx_lock(&sync_mtx);
 5137         vp = mp->mnt_syncer;
 5138         if (vp != NULL)
 5139                 mp->mnt_syncer = NULL;
 5140         mtx_unlock(&sync_mtx);
 5141         if (vp != NULL)
 5142                 vrele(vp);
 5143 }
 5144 
 5145 /*
 5146  * Do a lazy sync of the filesystem.
 5147  */
 5148 static int
 5149 sync_fsync(struct vop_fsync_args *ap)
 5150 {
 5151         struct vnode *syncvp = ap->a_vp;
 5152         struct mount *mp = syncvp->v_mount;
 5153         int error, save;
 5154         struct bufobj *bo;
 5155 
 5156         /*
 5157          * We only need to do something if this is a lazy evaluation.
 5158          */
 5159         if (ap->a_waitfor != MNT_LAZY)
 5160                 return (0);
 5161 
 5162         /*
 5163          * Move ourselves to the back of the sync list.
 5164          */
 5165         bo = &syncvp->v_bufobj;
 5166         BO_LOCK(bo);
 5167         vn_syncer_add_to_worklist(bo, syncdelay);
 5168         BO_UNLOCK(bo);
 5169 
 5170         /*
 5171          * Walk the list of vnodes pushing all that are dirty and
 5172          * not already on the sync list.
 5173          */
 5174         if (vfs_busy(mp, MBF_NOWAIT) != 0)
 5175                 return (0);
 5176         VOP_UNLOCK(syncvp);
 5177         save = curthread_pflags_set(TDP_SYNCIO);
 5178         /*
 5179          * The filesystem at hand may be idle with free vnodes stored in the
 5180          * batch.  Return them instead of letting them stay there indefinitely.
 5181          */
 5182         vfs_periodic(mp, MNT_NOWAIT);
 5183         error = VFS_SYNC(mp, MNT_LAZY);
 5184         curthread_pflags_restore(save);
 5185         vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY);
 5186         vfs_unbusy(mp);
 5187         return (error);
 5188 }
 5189 
 5190 /*
 5191  * The syncer vnode is no referenced.
 5192  */
 5193 static int
 5194 sync_inactive(struct vop_inactive_args *ap)
 5195 {
 5196 
 5197         vgone(ap->a_vp);
 5198         return (0);
 5199 }
 5200 
 5201 /*
 5202  * The syncer vnode is no longer needed and is being decommissioned.
 5203  *
 5204  * Modifications to the worklist must be protected by sync_mtx.
 5205  */
 5206 static int
 5207 sync_reclaim(struct vop_reclaim_args *ap)
 5208 {
 5209         struct vnode *vp = ap->a_vp;
 5210         struct bufobj *bo;
 5211 
 5212         bo = &vp->v_bufobj;
 5213         BO_LOCK(bo);
 5214         mtx_lock(&sync_mtx);
 5215         if (vp->v_mount->mnt_syncer == vp)
 5216                 vp->v_mount->mnt_syncer = NULL;
 5217         if (bo->bo_flag & BO_ONWORKLST) {
 5218                 LIST_REMOVE(bo, bo_synclist);
 5219                 syncer_worklist_len--;
 5220                 sync_vnode_count--;
 5221                 bo->bo_flag &= ~BO_ONWORKLST;
 5222         }
 5223         mtx_unlock(&sync_mtx);
 5224         BO_UNLOCK(bo);
 5225 
 5226         return (0);
 5227 }
 5228 
 5229 int
 5230 vn_need_pageq_flush(struct vnode *vp)
 5231 {
 5232         struct vm_object *obj;
 5233 
 5234         obj = vp->v_object;
 5235         return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
 5236             vm_object_mightbedirty(obj));
 5237 }
 5238 
 5239 /*
 5240  * Check if vnode represents a disk device
 5241  */
 5242 bool
 5243 vn_isdisk_error(struct vnode *vp, int *errp)
 5244 {
 5245         int error;
 5246 
 5247         if (vp->v_type != VCHR) {
 5248                 error = ENOTBLK;
 5249                 goto out;
 5250         }
 5251         error = 0;
 5252         dev_lock();
 5253         if (vp->v_rdev == NULL)
 5254                 error = ENXIO;
 5255         else if (vp->v_rdev->si_devsw == NULL)
 5256                 error = ENXIO;
 5257         else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
 5258                 error = ENOTBLK;
 5259         dev_unlock();
 5260 out:
 5261         *errp = error;
 5262         return (error == 0);
 5263 }
 5264 
 5265 bool
 5266 vn_isdisk(struct vnode *vp)
 5267 {
 5268         int error;
 5269 
 5270         return (vn_isdisk_error(vp, &error));
 5271 }
 5272 
 5273 /*
 5274  * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
 5275  * the comment above cache_fplookup for details.
 5276  */
 5277 int
 5278 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
 5279 {
 5280         int error;
 5281 
 5282         VFS_SMR_ASSERT_ENTERED();
 5283 
 5284         /* Check the owner. */
 5285         if (cred->cr_uid == file_uid) {
 5286                 if (file_mode & S_IXUSR)
 5287                         return (0);
 5288                 goto out_error;
 5289         }
 5290 
 5291         /* Otherwise, check the groups (first match) */
 5292         if (groupmember(file_gid, cred)) {
 5293                 if (file_mode & S_IXGRP)
 5294                         return (0);
 5295                 goto out_error;
 5296         }
 5297 
 5298         /* Otherwise, check everyone else. */
 5299         if (file_mode & S_IXOTH)
 5300                 return (0);
 5301 out_error:
 5302         /*
 5303          * Permission check failed, but it is possible denial will get overwritten
 5304          * (e.g., when root is traversing through a 700 directory owned by someone
 5305          * else).
 5306          *
 5307          * vaccess() calls priv_check_cred which in turn can descent into MAC
 5308          * modules overriding this result. It's quite unclear what semantics
 5309          * are allowed for them to operate, thus for safety we don't call them
 5310          * from within the SMR section. This also means if any such modules
 5311          * are present, we have to let the regular lookup decide.
 5312          */
 5313         error = priv_check_cred_vfs_lookup_nomac(cred);
 5314         switch (error) {
 5315         case 0:
 5316                 return (0);
 5317         case EAGAIN:
 5318                 /*
 5319                  * MAC modules present.
 5320                  */
 5321                 return (EAGAIN);
 5322         case EPERM:
 5323                 return (EACCES);
 5324         default:
 5325                 return (error);
 5326         }
 5327 }
 5328 
 5329 /*
 5330  * Common filesystem object access control check routine.  Accepts a
 5331  * vnode's type, "mode", uid and gid, requested access mode, and credentials.
 5332  * Returns 0 on success, or an errno on failure.
 5333  */
 5334 int
 5335 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
 5336     accmode_t accmode, struct ucred *cred)
 5337 {
 5338         accmode_t dac_granted;
 5339         accmode_t priv_granted;
 5340 
 5341         KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
 5342             ("invalid bit in accmode"));
 5343         KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
 5344             ("VAPPEND without VWRITE"));
 5345 
 5346         /*
 5347          * Look for a normal, non-privileged way to access the file/directory
 5348          * as requested.  If it exists, go with that.
 5349          */
 5350 
 5351         dac_granted = 0;
 5352 
 5353         /* Check the owner. */
 5354         if (cred->cr_uid == file_uid) {
 5355                 dac_granted |= VADMIN;
 5356                 if (file_mode & S_IXUSR)
 5357                         dac_granted |= VEXEC;
 5358                 if (file_mode & S_IRUSR)
 5359                         dac_granted |= VREAD;
 5360                 if (file_mode & S_IWUSR)
 5361                         dac_granted |= (VWRITE | VAPPEND);
 5362 
 5363                 if ((accmode & dac_granted) == accmode)
 5364                         return (0);
 5365 
 5366                 goto privcheck;
 5367         }
 5368 
 5369         /* Otherwise, check the groups (first match) */
 5370         if (groupmember(file_gid, cred)) {
 5371                 if (file_mode & S_IXGRP)
 5372                         dac_granted |= VEXEC;
 5373                 if (file_mode & S_IRGRP)
 5374                         dac_granted |= VREAD;
 5375                 if (file_mode & S_IWGRP)
 5376                         dac_granted |= (VWRITE | VAPPEND);
 5377 
 5378                 if ((accmode & dac_granted) == accmode)
 5379                         return (0);
 5380 
 5381                 goto privcheck;
 5382         }
 5383 
 5384         /* Otherwise, check everyone else. */
 5385         if (file_mode & S_IXOTH)
 5386                 dac_granted |= VEXEC;
 5387         if (file_mode & S_IROTH)
 5388                 dac_granted |= VREAD;
 5389         if (file_mode & S_IWOTH)
 5390                 dac_granted |= (VWRITE | VAPPEND);
 5391         if ((accmode & dac_granted) == accmode)
 5392                 return (0);
 5393 
 5394 privcheck:
 5395         /*
 5396          * Build a privilege mask to determine if the set of privileges
 5397          * satisfies the requirements when combined with the granted mask
 5398          * from above.  For each privilege, if the privilege is required,
 5399          * bitwise or the request type onto the priv_granted mask.
 5400          */
 5401         priv_granted = 0;
 5402 
 5403         if (type == VDIR) {
 5404                 /*
 5405                  * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
 5406                  * requests, instead of PRIV_VFS_EXEC.
 5407                  */
 5408                 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 5409                     !priv_check_cred(cred, PRIV_VFS_LOOKUP))
 5410                         priv_granted |= VEXEC;
 5411         } else {
 5412                 /*
 5413                  * Ensure that at least one execute bit is on. Otherwise,
 5414                  * a privileged user will always succeed, and we don't want
 5415                  * this to happen unless the file really is executable.
 5416                  */
 5417                 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 5418                     (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
 5419                     !priv_check_cred(cred, PRIV_VFS_EXEC))
 5420                         priv_granted |= VEXEC;
 5421         }
 5422 
 5423         if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
 5424             !priv_check_cred(cred, PRIV_VFS_READ))
 5425                 priv_granted |= VREAD;
 5426 
 5427         if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
 5428             !priv_check_cred(cred, PRIV_VFS_WRITE))
 5429                 priv_granted |= (VWRITE | VAPPEND);
 5430 
 5431         if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
 5432             !priv_check_cred(cred, PRIV_VFS_ADMIN))
 5433                 priv_granted |= VADMIN;
 5434 
 5435         if ((accmode & (priv_granted | dac_granted)) == accmode) {
 5436                 return (0);
 5437         }
 5438 
 5439         return ((accmode & VADMIN) ? EPERM : EACCES);
 5440 }
 5441 
 5442 /*
 5443  * Credential check based on process requesting service, and per-attribute
 5444  * permissions.
 5445  */
 5446 int
 5447 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
 5448     struct thread *td, accmode_t accmode)
 5449 {
 5450 
 5451         /*
 5452          * Kernel-invoked always succeeds.
 5453          */
 5454         if (cred == NOCRED)
 5455                 return (0);
 5456 
 5457         /*
 5458          * Do not allow privileged processes in jail to directly manipulate
 5459          * system attributes.
 5460          */
 5461         switch (attrnamespace) {
 5462         case EXTATTR_NAMESPACE_SYSTEM:
 5463                 /* Potentially should be: return (EPERM); */
 5464                 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
 5465         case EXTATTR_NAMESPACE_USER:
 5466                 return (VOP_ACCESS(vp, accmode, cred, td));
 5467         default:
 5468                 return (EPERM);
 5469         }
 5470 }
 5471 
 5472 #ifdef DEBUG_VFS_LOCKS
 5473 int vfs_badlock_ddb = 1;        /* Drop into debugger on violation. */
 5474 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
 5475     "Drop into debugger on lock violation");
 5476 
 5477 int vfs_badlock_mutex = 1;      /* Check for interlock across VOPs. */
 5478 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
 5479     0, "Check for interlock across VOPs");
 5480 
 5481 int vfs_badlock_print = 1;      /* Print lock violations. */
 5482 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
 5483     0, "Print lock violations");
 5484 
 5485 int vfs_badlock_vnode = 1;      /* Print vnode details on lock violations. */
 5486 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
 5487     0, "Print vnode details on lock violations");
 5488 
 5489 #ifdef KDB
 5490 int vfs_badlock_backtrace = 1;  /* Print backtrace at lock violations. */
 5491 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
 5492     &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
 5493 #endif
 5494 
 5495 static void
 5496 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
 5497 {
 5498 
 5499 #ifdef KDB
 5500         if (vfs_badlock_backtrace)
 5501                 kdb_backtrace();
 5502 #endif
 5503         if (vfs_badlock_vnode)
 5504                 vn_printf(vp, "vnode ");
 5505         if (vfs_badlock_print)
 5506                 printf("%s: %p %s\n", str, (void *)vp, msg);
 5507         if (vfs_badlock_ddb)
 5508                 kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
 5509 }
 5510 
 5511 void
 5512 assert_vi_locked(struct vnode *vp, const char *str)
 5513 {
 5514 
 5515         if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
 5516                 vfs_badlock("interlock is not locked but should be", str, vp);
 5517 }
 5518 
 5519 void
 5520 assert_vi_unlocked(struct vnode *vp, const char *str)
 5521 {
 5522 
 5523         if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
 5524                 vfs_badlock("interlock is locked but should not be", str, vp);
 5525 }
 5526 
 5527 void
 5528 assert_vop_locked(struct vnode *vp, const char *str)
 5529 {
 5530         int locked;
 5531 
 5532         if (KERNEL_PANICKED() || vp == NULL)
 5533                 return;
 5534 
 5535         locked = VOP_ISLOCKED(vp);
 5536         if (locked == 0 || locked == LK_EXCLOTHER)
 5537                 vfs_badlock("is not locked but should be", str, vp);
 5538 }
 5539 
 5540 void
 5541 assert_vop_unlocked(struct vnode *vp, const char *str)
 5542 {
 5543         if (KERNEL_PANICKED() || vp == NULL)
 5544                 return;
 5545 
 5546         if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
 5547                 vfs_badlock("is locked but should not be", str, vp);
 5548 }
 5549 
 5550 void
 5551 assert_vop_elocked(struct vnode *vp, const char *str)
 5552 {
 5553         if (KERNEL_PANICKED() || vp == NULL)
 5554                 return;
 5555 
 5556         if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 5557                 vfs_badlock("is not exclusive locked but should be", str, vp);
 5558 }
 5559 #endif /* DEBUG_VFS_LOCKS */
 5560 
 5561 void
 5562 vop_rename_fail(struct vop_rename_args *ap)
 5563 {
 5564 
 5565         if (ap->a_tvp != NULL)
 5566                 vput(ap->a_tvp);
 5567         if (ap->a_tdvp == ap->a_tvp)
 5568                 vrele(ap->a_tdvp);
 5569         else
 5570                 vput(ap->a_tdvp);
 5571         vrele(ap->a_fdvp);
 5572         vrele(ap->a_fvp);
 5573 }
 5574 
 5575 void
 5576 vop_rename_pre(void *ap)
 5577 {
 5578         struct vop_rename_args *a = ap;
 5579 
 5580 #ifdef DEBUG_VFS_LOCKS
 5581         if (a->a_tvp)
 5582                 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
 5583         ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
 5584         ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
 5585         ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
 5586 
 5587         /* Check the source (from). */
 5588         if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
 5589             (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
 5590                 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
 5591         if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
 5592                 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
 5593 
 5594         /* Check the target. */
 5595         if (a->a_tvp)
 5596                 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
 5597         ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
 5598 #endif
 5599         /*
 5600          * It may be tempting to add vn_seqc_write_begin/end calls here and
 5601          * in vop_rename_post but that's not going to work out since some
 5602          * filesystems relookup vnodes mid-rename. This is probably a bug.
 5603          *
 5604          * For now filesystems are expected to do the relevant calls after they
 5605          * decide what vnodes to operate on.
 5606          */
 5607         if (a->a_tdvp != a->a_fdvp)
 5608                 vhold(a->a_fdvp);
 5609         if (a->a_tvp != a->a_fvp)
 5610                 vhold(a->a_fvp);
 5611         vhold(a->a_tdvp);
 5612         if (a->a_tvp)
 5613                 vhold(a->a_tvp);
 5614 }
 5615 
 5616 #ifdef DEBUG_VFS_LOCKS
 5617 void
 5618 vop_fplookup_vexec_debugpre(void *ap __unused)
 5619 {
 5620 
 5621         VFS_SMR_ASSERT_ENTERED();
 5622 }
 5623 
 5624 void
 5625 vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused)
 5626 {
 5627 
 5628         VFS_SMR_ASSERT_ENTERED();
 5629 }
 5630 
 5631 void
 5632 vop_fplookup_symlink_debugpre(void *ap __unused)
 5633 {
 5634 
 5635         VFS_SMR_ASSERT_ENTERED();
 5636 }
 5637 
 5638 void
 5639 vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused)
 5640 {
 5641 
 5642         VFS_SMR_ASSERT_ENTERED();
 5643 }
 5644 
 5645 static void
 5646 vop_fsync_debugprepost(struct vnode *vp, const char *name)
 5647 {
 5648         if (vp->v_type == VCHR)
 5649                 ;
 5650         else if (MNT_EXTENDED_SHARED(vp->v_mount))
 5651                 ASSERT_VOP_LOCKED(vp, name);
 5652         else
 5653                 ASSERT_VOP_ELOCKED(vp, name);
 5654 }
 5655 
 5656 void
 5657 vop_fsync_debugpre(void *a)
 5658 {
 5659         struct vop_fsync_args *ap;
 5660 
 5661         ap = a;
 5662         vop_fsync_debugprepost(ap->a_vp, "fsync");
 5663 }
 5664 
 5665 void
 5666 vop_fsync_debugpost(void *a, int rc __unused)
 5667 {
 5668         struct vop_fsync_args *ap;
 5669 
 5670         ap = a;
 5671         vop_fsync_debugprepost(ap->a_vp, "fsync");
 5672 }
 5673 
 5674 void
 5675 vop_fdatasync_debugpre(void *a)
 5676 {
 5677         struct vop_fdatasync_args *ap;
 5678 
 5679         ap = a;
 5680         vop_fsync_debugprepost(ap->a_vp, "fsync");
 5681 }
 5682 
 5683 void
 5684 vop_fdatasync_debugpost(void *a, int rc __unused)
 5685 {
 5686         struct vop_fdatasync_args *ap;
 5687 
 5688         ap = a;
 5689         vop_fsync_debugprepost(ap->a_vp, "fsync");
 5690 }
 5691 
 5692 void
 5693 vop_strategy_debugpre(void *ap)
 5694 {
 5695         struct vop_strategy_args *a;
 5696         struct buf *bp;
 5697 
 5698         a = ap;
 5699         bp = a->a_bp;
 5700 
 5701         /*
 5702          * Cluster ops lock their component buffers but not the IO container.
 5703          */
 5704         if ((bp->b_flags & B_CLUSTER) != 0)
 5705                 return;
 5706 
 5707         if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) {
 5708                 if (vfs_badlock_print)
 5709                         printf(
 5710                             "VOP_STRATEGY: bp is not locked but should be\n");
 5711                 if (vfs_badlock_ddb)
 5712                         kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
 5713         }
 5714 }
 5715 
 5716 void
 5717 vop_lock_debugpre(void *ap)
 5718 {
 5719         struct vop_lock1_args *a = ap;
 5720 
 5721         if ((a->a_flags & LK_INTERLOCK) == 0)
 5722                 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 5723         else
 5724                 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
 5725 }
 5726 
 5727 void
 5728 vop_lock_debugpost(void *ap, int rc)
 5729 {
 5730         struct vop_lock1_args *a = ap;
 5731 
 5732         ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 5733         if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
 5734                 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
 5735 }
 5736 
 5737 void
 5738 vop_unlock_debugpre(void *ap)
 5739 {
 5740         struct vop_unlock_args *a = ap;
 5741 
 5742         ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
 5743 }
 5744 
 5745 void
 5746 vop_need_inactive_debugpre(void *ap)
 5747 {
 5748         struct vop_need_inactive_args *a = ap;
 5749 
 5750         ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
 5751 }
 5752 
 5753 void
 5754 vop_need_inactive_debugpost(void *ap, int rc)
 5755 {
 5756         struct vop_need_inactive_args *a = ap;
 5757 
 5758         ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
 5759 }
 5760 #endif
 5761 
 5762 void
 5763 vop_create_pre(void *ap)
 5764 {
 5765         struct vop_create_args *a;
 5766         struct vnode *dvp;
 5767 
 5768         a = ap;
 5769         dvp = a->a_dvp;
 5770         vn_seqc_write_begin(dvp);
 5771 }
 5772 
 5773 void
 5774 vop_create_post(void *ap, int rc)
 5775 {
 5776         struct vop_create_args *a;
 5777         struct vnode *dvp;
 5778 
 5779         a = ap;
 5780         dvp = a->a_dvp;
 5781         vn_seqc_write_end(dvp);
 5782         if (!rc)
 5783                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
 5784 }
 5785 
 5786 void
 5787 vop_whiteout_pre(void *ap)
 5788 {
 5789         struct vop_whiteout_args *a;
 5790         struct vnode *dvp;
 5791 
 5792         a = ap;
 5793         dvp = a->a_dvp;
 5794         vn_seqc_write_begin(dvp);
 5795 }
 5796 
 5797 void
 5798 vop_whiteout_post(void *ap, int rc)
 5799 {
 5800         struct vop_whiteout_args *a;
 5801         struct vnode *dvp;
 5802 
 5803         a = ap;
 5804         dvp = a->a_dvp;
 5805         vn_seqc_write_end(dvp);
 5806 }
 5807 
 5808 void
 5809 vop_deleteextattr_pre(void *ap)
 5810 {
 5811         struct vop_deleteextattr_args *a;
 5812         struct vnode *vp;
 5813 
 5814         a = ap;
 5815         vp = a->a_vp;
 5816         vn_seqc_write_begin(vp);
 5817 }
 5818 
 5819 void
 5820 vop_deleteextattr_post(void *ap, int rc)
 5821 {
 5822         struct vop_deleteextattr_args *a;
 5823         struct vnode *vp;
 5824 
 5825         a = ap;
 5826         vp = a->a_vp;
 5827         vn_seqc_write_end(vp);
 5828         if (!rc)
 5829                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
 5830 }
 5831 
 5832 void
 5833 vop_link_pre(void *ap)
 5834 {
 5835         struct vop_link_args *a;
 5836         struct vnode *vp, *tdvp;
 5837 
 5838         a = ap;
 5839         vp = a->a_vp;
 5840         tdvp = a->a_tdvp;
 5841         vn_seqc_write_begin(vp);
 5842         vn_seqc_write_begin(tdvp);
 5843 }
 5844 
 5845 void
 5846 vop_link_post(void *ap, int rc)
 5847 {
 5848         struct vop_link_args *a;
 5849         struct vnode *vp, *tdvp;
 5850 
 5851         a = ap;
 5852         vp = a->a_vp;
 5853         tdvp = a->a_tdvp;
 5854         vn_seqc_write_end(vp);
 5855         vn_seqc_write_end(tdvp);
 5856         if (!rc) {
 5857                 VFS_KNOTE_LOCKED(vp, NOTE_LINK);
 5858                 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE);
 5859         }
 5860 }
 5861 
 5862 void
 5863 vop_mkdir_pre(void *ap)
 5864 {
 5865         struct vop_mkdir_args *a;
 5866         struct vnode *dvp;
 5867 
 5868         a = ap;
 5869         dvp = a->a_dvp;
 5870         vn_seqc_write_begin(dvp);
 5871 }
 5872 
 5873 void
 5874 vop_mkdir_post(void *ap, int rc)
 5875 {
 5876         struct vop_mkdir_args *a;
 5877         struct vnode *dvp;
 5878 
 5879         a = ap;
 5880         dvp = a->a_dvp;
 5881         vn_seqc_write_end(dvp);
 5882         if (!rc)
 5883                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
 5884 }
 5885 
 5886 #ifdef DEBUG_VFS_LOCKS
 5887 void
 5888 vop_mkdir_debugpost(void *ap, int rc)
 5889 {
 5890         struct vop_mkdir_args *a;
 5891 
 5892         a = ap;
 5893         if (!rc)
 5894                 cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp);
 5895 }
 5896 #endif
 5897 
 5898 void
 5899 vop_mknod_pre(void *ap)
 5900 {
 5901         struct vop_mknod_args *a;
 5902         struct vnode *dvp;
 5903 
 5904         a = ap;
 5905         dvp = a->a_dvp;
 5906         vn_seqc_write_begin(dvp);
 5907 }
 5908 
 5909 void
 5910 vop_mknod_post(void *ap, int rc)
 5911 {
 5912         struct vop_mknod_args *a;
 5913         struct vnode *dvp;
 5914 
 5915         a = ap;
 5916         dvp = a->a_dvp;
 5917         vn_seqc_write_end(dvp);
 5918         if (!rc)
 5919                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
 5920 }
 5921 
 5922 void
 5923 vop_reclaim_post(void *ap, int rc)
 5924 {
 5925         struct vop_reclaim_args *a;
 5926         struct vnode *vp;
 5927 
 5928         a = ap;
 5929         vp = a->a_vp;
 5930         ASSERT_VOP_IN_SEQC(vp);
 5931         if (!rc)
 5932                 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE);
 5933 }
 5934 
 5935 void
 5936 vop_remove_pre(void *ap)
 5937 {
 5938         struct vop_remove_args *a;
 5939         struct vnode *dvp, *vp;
 5940 
 5941         a = ap;
 5942         dvp = a->a_dvp;
 5943         vp = a->a_vp;
 5944         vn_seqc_write_begin(dvp);
 5945         vn_seqc_write_begin(vp);
 5946 }
 5947 
 5948 void
 5949 vop_remove_post(void *ap, int rc)
 5950 {
 5951         struct vop_remove_args *a;
 5952         struct vnode *dvp, *vp;
 5953 
 5954         a = ap;
 5955         dvp = a->a_dvp;
 5956         vp = a->a_vp;
 5957         vn_seqc_write_end(dvp);
 5958         vn_seqc_write_end(vp);
 5959         if (!rc) {
 5960                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
 5961                 VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
 5962         }
 5963 }
 5964 
 5965 void
 5966 vop_rename_post(void *ap, int rc)
 5967 {
 5968         struct vop_rename_args *a = ap;
 5969         long hint;
 5970 
 5971         if (!rc) {
 5972                 hint = NOTE_WRITE;
 5973                 if (a->a_fdvp == a->a_tdvp) {
 5974                         if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
 5975                                 hint |= NOTE_LINK;
 5976                         VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
 5977                         VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
 5978                 } else {
 5979                         hint |= NOTE_EXTEND;
 5980                         if (a->a_fvp->v_type == VDIR)
 5981                                 hint |= NOTE_LINK;
 5982                         VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
 5983 
 5984                         if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
 5985                             a->a_tvp->v_type == VDIR)
 5986                                 hint &= ~NOTE_LINK;
 5987                         VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
 5988                 }
 5989 
 5990                 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
 5991                 if (a->a_tvp)
 5992                         VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
 5993         }
 5994         if (a->a_tdvp != a->a_fdvp)
 5995                 vdrop(a->a_fdvp);
 5996         if (a->a_tvp != a->a_fvp)
 5997                 vdrop(a->a_fvp);
 5998         vdrop(a->a_tdvp);
 5999         if (a->a_tvp)
 6000                 vdrop(a->a_tvp);
 6001 }
 6002 
 6003 void
 6004 vop_rmdir_pre(void *ap)
 6005 {
 6006         struct vop_rmdir_args *a;
 6007         struct vnode *dvp, *vp;
 6008 
 6009         a = ap;
 6010         dvp = a->a_dvp;
 6011         vp = a->a_vp;
 6012         vn_seqc_write_begin(dvp);
 6013         vn_seqc_write_begin(vp);
 6014 }
 6015 
 6016 void
 6017 vop_rmdir_post(void *ap, int rc)
 6018 {
 6019         struct vop_rmdir_args *a;
 6020         struct vnode *dvp, *vp;
 6021 
 6022         a = ap;
 6023         dvp = a->a_dvp;
 6024         vp = a->a_vp;
 6025         vn_seqc_write_end(dvp);
 6026         vn_seqc_write_end(vp);
 6027         if (!rc) {
 6028                 vp->v_vflag |= VV_UNLINKED;
 6029                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
 6030                 VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
 6031         }
 6032 }
 6033 
 6034 void
 6035 vop_setattr_pre(void *ap)
 6036 {
 6037         struct vop_setattr_args *a;
 6038         struct vnode *vp;
 6039 
 6040         a = ap;
 6041         vp = a->a_vp;
 6042         vn_seqc_write_begin(vp);
 6043 }
 6044 
 6045 void
 6046 vop_setattr_post(void *ap, int rc)
 6047 {
 6048         struct vop_setattr_args *a;
 6049         struct vnode *vp;
 6050 
 6051         a = ap;
 6052         vp = a->a_vp;
 6053         vn_seqc_write_end(vp);
 6054         if (!rc)
 6055                 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
 6056 }
 6057 
 6058 void
 6059 vop_setacl_pre(void *ap)
 6060 {
 6061         struct vop_setacl_args *a;
 6062         struct vnode *vp;
 6063 
 6064         a = ap;
 6065         vp = a->a_vp;
 6066         vn_seqc_write_begin(vp);
 6067 }
 6068 
 6069 void
 6070 vop_setacl_post(void *ap, int rc __unused)
 6071 {
 6072         struct vop_setacl_args *a;
 6073         struct vnode *vp;
 6074 
 6075         a = ap;
 6076         vp = a->a_vp;
 6077         vn_seqc_write_end(vp);
 6078 }
 6079 
 6080 void
 6081 vop_setextattr_pre(void *ap)
 6082 {
 6083         struct vop_setextattr_args *a;
 6084         struct vnode *vp;
 6085 
 6086         a = ap;
 6087         vp = a->a_vp;
 6088         vn_seqc_write_begin(vp);
 6089 }
 6090 
 6091 void
 6092 vop_setextattr_post(void *ap, int rc)
 6093 {
 6094         struct vop_setextattr_args *a;
 6095         struct vnode *vp;
 6096 
 6097         a = ap;
 6098         vp = a->a_vp;
 6099         vn_seqc_write_end(vp);
 6100         if (!rc)
 6101                 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
 6102 }
 6103 
 6104 void
 6105 vop_symlink_pre(void *ap)
 6106 {
 6107         struct vop_symlink_args *a;
 6108         struct vnode *dvp;
 6109 
 6110         a = ap;
 6111         dvp = a->a_dvp;
 6112         vn_seqc_write_begin(dvp);
 6113 }
 6114 
 6115 void
 6116 vop_symlink_post(void *ap, int rc)
 6117 {
 6118         struct vop_symlink_args *a;
 6119         struct vnode *dvp;
 6120 
 6121         a = ap;
 6122         dvp = a->a_dvp;
 6123         vn_seqc_write_end(dvp);
 6124         if (!rc)
 6125                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
 6126 }
 6127 
 6128 void
 6129 vop_open_post(void *ap, int rc)
 6130 {
 6131         struct vop_open_args *a = ap;
 6132 
 6133         if (!rc)
 6134                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
 6135 }
 6136 
 6137 void
 6138 vop_close_post(void *ap, int rc)
 6139 {
 6140         struct vop_close_args *a = ap;
 6141 
 6142         if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
 6143             !VN_IS_DOOMED(a->a_vp))) {
 6144                 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
 6145                     NOTE_CLOSE_WRITE : NOTE_CLOSE);
 6146         }
 6147 }
 6148 
 6149 void
 6150 vop_read_post(void *ap, int rc)
 6151 {
 6152         struct vop_read_args *a = ap;
 6153 
 6154         if (!rc)
 6155                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
 6156 }
 6157 
 6158 void
 6159 vop_read_pgcache_post(void *ap, int rc)
 6160 {
 6161         struct vop_read_pgcache_args *a = ap;
 6162 
 6163         if (!rc)
 6164                 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ);
 6165 }
 6166 
 6167 void
 6168 vop_readdir_post(void *ap, int rc)
 6169 {
 6170         struct vop_readdir_args *a = ap;
 6171 
 6172         if (!rc)
 6173                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
 6174 }
 6175 
 6176 static struct knlist fs_knlist;
 6177 
 6178 static void
 6179 vfs_event_init(void *arg)
 6180 {
 6181         knlist_init_mtx(&fs_knlist, NULL);
 6182 }
 6183 /* XXX - correct order? */
 6184 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
 6185 
 6186 void
 6187 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
 6188 {
 6189 
 6190         KNOTE_UNLOCKED(&fs_knlist, event);
 6191 }
 6192 
 6193 static int      filt_fsattach(struct knote *kn);
 6194 static void     filt_fsdetach(struct knote *kn);
 6195 static int      filt_fsevent(struct knote *kn, long hint);
 6196 
 6197 struct filterops fs_filtops = {
 6198         .f_isfd = 0,
 6199         .f_attach = filt_fsattach,
 6200         .f_detach = filt_fsdetach,
 6201         .f_event = filt_fsevent
 6202 };
 6203 
 6204 static int
 6205 filt_fsattach(struct knote *kn)
 6206 {
 6207 
 6208         kn->kn_flags |= EV_CLEAR;
 6209         knlist_add(&fs_knlist, kn, 0);
 6210         return (0);
 6211 }
 6212 
 6213 static void
 6214 filt_fsdetach(struct knote *kn)
 6215 {
 6216 
 6217         knlist_remove(&fs_knlist, kn, 0);
 6218 }
 6219 
 6220 static int
 6221 filt_fsevent(struct knote *kn, long hint)
 6222 {
 6223 
 6224         kn->kn_fflags |= hint;
 6225         return (kn->kn_fflags != 0);
 6226 }
 6227 
 6228 static int
 6229 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
 6230 {
 6231         struct vfsidctl vc;
 6232         int error;
 6233         struct mount *mp;
 6234 
 6235         error = SYSCTL_IN(req, &vc, sizeof(vc));
 6236         if (error)
 6237                 return (error);
 6238         if (vc.vc_vers != VFS_CTL_VERS1)
 6239                 return (EINVAL);
 6240         mp = vfs_getvfs(&vc.vc_fsid);
 6241         if (mp == NULL)
 6242                 return (ENOENT);
 6243         /* ensure that a specific sysctl goes to the right filesystem. */
 6244         if (strcmp(vc.vc_fstypename, "*") != 0 &&
 6245             strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
 6246                 vfs_rel(mp);
 6247                 return (EINVAL);
 6248         }
 6249         VCTLTOREQ(&vc, req);
 6250         error = VFS_SYSCTL(mp, vc.vc_op, req);
 6251         vfs_rel(mp);
 6252         return (error);
 6253 }
 6254 
 6255 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR,
 6256     NULL, 0, sysctl_vfs_ctl, "",
 6257     "Sysctl by fsid");
 6258 
 6259 /*
 6260  * Function to initialize a va_filerev field sensibly.
 6261  * XXX: Wouldn't a random number make a lot more sense ??
 6262  */
 6263 u_quad_t
 6264 init_va_filerev(void)
 6265 {
 6266         struct bintime bt;
 6267 
 6268         getbinuptime(&bt);
 6269         return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
 6270 }
 6271 
 6272 static int      filt_vfsread(struct knote *kn, long hint);
 6273 static int      filt_vfswrite(struct knote *kn, long hint);
 6274 static int      filt_vfsvnode(struct knote *kn, long hint);
 6275 static void     filt_vfsdetach(struct knote *kn);
 6276 static struct filterops vfsread_filtops = {
 6277         .f_isfd = 1,
 6278         .f_detach = filt_vfsdetach,
 6279         .f_event = filt_vfsread
 6280 };
 6281 static struct filterops vfswrite_filtops = {
 6282         .f_isfd = 1,
 6283         .f_detach = filt_vfsdetach,
 6284         .f_event = filt_vfswrite
 6285 };
 6286 static struct filterops vfsvnode_filtops = {
 6287         .f_isfd = 1,
 6288         .f_detach = filt_vfsdetach,
 6289         .f_event = filt_vfsvnode
 6290 };
 6291 
 6292 static void
 6293 vfs_knllock(void *arg)
 6294 {
 6295         struct vnode *vp = arg;
 6296 
 6297         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 6298 }
 6299 
 6300 static void
 6301 vfs_knlunlock(void *arg)
 6302 {
 6303         struct vnode *vp = arg;
 6304 
 6305         VOP_UNLOCK(vp);
 6306 }
 6307 
 6308 static void
 6309 vfs_knl_assert_lock(void *arg, int what)
 6310 {
 6311 #ifdef DEBUG_VFS_LOCKS
 6312         struct vnode *vp = arg;
 6313 
 6314         if (what == LA_LOCKED)
 6315                 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
 6316         else
 6317                 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
 6318 #endif
 6319 }
 6320 
 6321 int
 6322 vfs_kqfilter(struct vop_kqfilter_args *ap)
 6323 {
 6324         struct vnode *vp = ap->a_vp;
 6325         struct knote *kn = ap->a_kn;
 6326         struct knlist *knl;
 6327 
 6328         switch (kn->kn_filter) {
 6329         case EVFILT_READ:
 6330                 kn->kn_fop = &vfsread_filtops;
 6331                 break;
 6332         case EVFILT_WRITE:
 6333                 kn->kn_fop = &vfswrite_filtops;
 6334                 break;
 6335         case EVFILT_VNODE:
 6336                 kn->kn_fop = &vfsvnode_filtops;
 6337                 break;
 6338         default:
 6339                 return (EINVAL);
 6340         }
 6341 
 6342         kn->kn_hook = (caddr_t)vp;
 6343 
 6344         v_addpollinfo(vp);
 6345         if (vp->v_pollinfo == NULL)
 6346                 return (ENOMEM);
 6347         knl = &vp->v_pollinfo->vpi_selinfo.si_note;
 6348         vhold(vp);
 6349         knlist_add(knl, kn, 0);
 6350 
 6351         return (0);
 6352 }
 6353 
 6354 /*
 6355  * Detach knote from vnode
 6356  */
 6357 static void
 6358 filt_vfsdetach(struct knote *kn)
 6359 {
 6360         struct vnode *vp = (struct vnode *)kn->kn_hook;
 6361 
 6362         KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
 6363         knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
 6364         vdrop(vp);
 6365 }
 6366 
 6367 /*ARGSUSED*/
 6368 static int
 6369 filt_vfsread(struct knote *kn, long hint)
 6370 {
 6371         struct vnode *vp = (struct vnode *)kn->kn_hook;
 6372         struct vattr va;
 6373         int res;
 6374 
 6375         /*
 6376          * filesystem is gone, so set the EOF flag and schedule
 6377          * the knote for deletion.
 6378          */
 6379         if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
 6380                 VI_LOCK(vp);
 6381                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 6382                 VI_UNLOCK(vp);
 6383                 return (1);
 6384         }
 6385 
 6386         if (VOP_GETATTR(vp, &va, curthread->td_ucred))
 6387                 return (0);
 6388 
 6389         VI_LOCK(vp);
 6390         kn->kn_data = va.va_size - kn->kn_fp->f_offset;
 6391         res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
 6392         VI_UNLOCK(vp);
 6393         return (res);
 6394 }
 6395 
 6396 /*ARGSUSED*/
 6397 static int
 6398 filt_vfswrite(struct knote *kn, long hint)
 6399 {
 6400         struct vnode *vp = (struct vnode *)kn->kn_hook;
 6401 
 6402         VI_LOCK(vp);
 6403 
 6404         /*
 6405          * filesystem is gone, so set the EOF flag and schedule
 6406          * the knote for deletion.
 6407          */
 6408         if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
 6409                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 6410 
 6411         kn->kn_data = 0;
 6412         VI_UNLOCK(vp);
 6413         return (1);
 6414 }
 6415 
 6416 static int
 6417 filt_vfsvnode(struct knote *kn, long hint)
 6418 {
 6419         struct vnode *vp = (struct vnode *)kn->kn_hook;
 6420         int res;
 6421 
 6422         VI_LOCK(vp);
 6423         if (kn->kn_sfflags & hint)
 6424                 kn->kn_fflags |= hint;
 6425         if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
 6426                 kn->kn_flags |= EV_EOF;
 6427                 VI_UNLOCK(vp);
 6428                 return (1);
 6429         }
 6430         res = (kn->kn_fflags != 0);
 6431         VI_UNLOCK(vp);
 6432         return (res);
 6433 }
 6434 
 6435 /*
 6436  * Returns whether the directory is empty or not.
 6437  * If it is empty, the return value is 0; otherwise
 6438  * the return value is an error value (which may
 6439  * be ENOTEMPTY).
 6440  */
 6441 int
 6442 vfs_emptydir(struct vnode *vp)
 6443 {
 6444         struct uio uio;
 6445         struct iovec iov;
 6446         struct dirent *dirent, *dp, *endp;
 6447         int error, eof;
 6448 
 6449         error = 0;
 6450         eof = 0;
 6451 
 6452         ASSERT_VOP_LOCKED(vp, "vfs_emptydir");
 6453         VNASSERT(vp->v_type == VDIR, vp, ("vp is not a directory"));
 6454 
 6455         dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK);
 6456         iov.iov_base = dirent;
 6457         iov.iov_len = sizeof(struct dirent);
 6458 
 6459         uio.uio_iov = &iov;
 6460         uio.uio_iovcnt = 1;
 6461         uio.uio_offset = 0;
 6462         uio.uio_resid = sizeof(struct dirent);
 6463         uio.uio_segflg = UIO_SYSSPACE;
 6464         uio.uio_rw = UIO_READ;
 6465         uio.uio_td = curthread;
 6466 
 6467         while (eof == 0 && error == 0) {
 6468                 error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof,
 6469                     NULL, NULL);
 6470                 if (error != 0)
 6471                         break;
 6472                 endp = (void *)((uint8_t *)dirent +
 6473                     sizeof(struct dirent) - uio.uio_resid);
 6474                 for (dp = dirent; dp < endp;
 6475                      dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) {
 6476                         if (dp->d_type == DT_WHT)
 6477                                 continue;
 6478                         if (dp->d_namlen == 0)
 6479                                 continue;
 6480                         if (dp->d_type != DT_DIR &&
 6481                             dp->d_type != DT_UNKNOWN) {
 6482                                 error = ENOTEMPTY;
 6483                                 break;
 6484                         }
 6485                         if (dp->d_namlen > 2) {
 6486                                 error = ENOTEMPTY;
 6487                                 break;
 6488                         }
 6489                         if (dp->d_namlen == 1 &&
 6490                             dp->d_name[0] != '.') {
 6491                                 error = ENOTEMPTY;
 6492                                 break;
 6493                         }
 6494                         if (dp->d_namlen == 2 &&
 6495                             dp->d_name[1] != '.') {
 6496                                 error = ENOTEMPTY;
 6497                                 break;
 6498                         }
 6499                         uio.uio_resid = sizeof(struct dirent);
 6500                 }
 6501         }
 6502         free(dirent, M_TEMP);
 6503         return (error);
 6504 }
 6505 
 6506 int
 6507 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
 6508 {
 6509         int error;
 6510 
 6511         if (dp->d_reclen > ap->a_uio->uio_resid)
 6512                 return (ENAMETOOLONG);
 6513         error = uiomove(dp, dp->d_reclen, ap->a_uio);
 6514         if (error) {
 6515                 if (ap->a_ncookies != NULL) {
 6516                         if (ap->a_cookies != NULL)
 6517                                 free(ap->a_cookies, M_TEMP);
 6518                         ap->a_cookies = NULL;
 6519                         *ap->a_ncookies = 0;
 6520                 }
 6521                 return (error);
 6522         }
 6523         if (ap->a_ncookies == NULL)
 6524                 return (0);
 6525 
 6526         KASSERT(ap->a_cookies,
 6527             ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
 6528 
 6529         *ap->a_cookies = realloc(*ap->a_cookies,
 6530             (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
 6531         (*ap->a_cookies)[*ap->a_ncookies] = off;
 6532         *ap->a_ncookies += 1;
 6533         return (0);
 6534 }
 6535 
 6536 /*
 6537  * The purpose of this routine is to remove granularity from accmode_t,
 6538  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
 6539  * VADMIN and VAPPEND.
 6540  *
 6541  * If it returns 0, the caller is supposed to continue with the usual
 6542  * access checks using 'accmode' as modified by this routine.  If it
 6543  * returns nonzero value, the caller is supposed to return that value
 6544  * as errno.
 6545  *
 6546  * Note that after this routine runs, accmode may be zero.
 6547  */
 6548 int
 6549 vfs_unixify_accmode(accmode_t *accmode)
 6550 {
 6551         /*
 6552          * There is no way to specify explicit "deny" rule using
 6553          * file mode or POSIX.1e ACLs.
 6554          */
 6555         if (*accmode & VEXPLICIT_DENY) {
 6556                 *accmode = 0;
 6557                 return (0);
 6558         }
 6559 
 6560         /*
 6561          * None of these can be translated into usual access bits.
 6562          * Also, the common case for NFSv4 ACLs is to not contain
 6563          * either of these bits. Caller should check for VWRITE
 6564          * on the containing directory instead.
 6565          */
 6566         if (*accmode & (VDELETE_CHILD | VDELETE))
 6567                 return (EPERM);
 6568 
 6569         if (*accmode & VADMIN_PERMS) {
 6570                 *accmode &= ~VADMIN_PERMS;
 6571                 *accmode |= VADMIN;
 6572         }
 6573 
 6574         /*
 6575          * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
 6576          * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
 6577          */
 6578         *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
 6579 
 6580         return (0);
 6581 }
 6582 
 6583 /*
 6584  * Clear out a doomed vnode (if any) and replace it with a new one as long
 6585  * as the fs is not being unmounted. Return the root vnode to the caller.
 6586  */
 6587 static int __noinline
 6588 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp)
 6589 {
 6590         struct vnode *vp;
 6591         int error;
 6592 
 6593 restart:
 6594         if (mp->mnt_rootvnode != NULL) {
 6595                 MNT_ILOCK(mp);
 6596                 vp = mp->mnt_rootvnode;
 6597                 if (vp != NULL) {
 6598                         if (!VN_IS_DOOMED(vp)) {
 6599                                 vrefact(vp);
 6600                                 MNT_IUNLOCK(mp);
 6601                                 error = vn_lock(vp, flags);
 6602                                 if (error == 0) {
 6603                                         *vpp = vp;
 6604                                         return (0);
 6605                                 }
 6606                                 vrele(vp);
 6607                                 goto restart;
 6608                         }
 6609                         /*
 6610                          * Clear the old one.
 6611                          */
 6612                         mp->mnt_rootvnode = NULL;
 6613                 }
 6614                 MNT_IUNLOCK(mp);
 6615                 if (vp != NULL) {
 6616                         vfs_op_barrier_wait(mp);
 6617                         vrele(vp);
 6618                 }
 6619         }
 6620         error = VFS_CACHEDROOT(mp, flags, vpp);
 6621         if (error != 0)
 6622                 return (error);
 6623         if (mp->mnt_vfs_ops == 0) {
 6624                 MNT_ILOCK(mp);
 6625                 if (mp->mnt_vfs_ops != 0) {
 6626                         MNT_IUNLOCK(mp);
 6627                         return (0);
 6628                 }
 6629                 if (mp->mnt_rootvnode == NULL) {
 6630                         vrefact(*vpp);
 6631                         mp->mnt_rootvnode = *vpp;
 6632                 } else {
 6633                         if (mp->mnt_rootvnode != *vpp) {
 6634                                 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) {
 6635                                         panic("%s: mismatch between vnode returned "
 6636                                             " by VFS_CACHEDROOT and the one cached "
 6637                                             " (%p != %p)",
 6638                                             __func__, *vpp, mp->mnt_rootvnode);
 6639                                 }
 6640                         }
 6641                 }
 6642                 MNT_IUNLOCK(mp);
 6643         }
 6644         return (0);
 6645 }
 6646 
 6647 int
 6648 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp)
 6649 {
 6650         struct mount_pcpu *mpcpu;
 6651         struct vnode *vp;
 6652         int error;
 6653 
 6654         if (!vfs_op_thread_enter(mp, mpcpu))
 6655                 return (vfs_cache_root_fallback(mp, flags, vpp));
 6656         vp = atomic_load_ptr(&mp->mnt_rootvnode);
 6657         if (vp == NULL || VN_IS_DOOMED(vp)) {
 6658                 vfs_op_thread_exit(mp, mpcpu);
 6659                 return (vfs_cache_root_fallback(mp, flags, vpp));
 6660         }
 6661         vrefact(vp);
 6662         vfs_op_thread_exit(mp, mpcpu);
 6663         error = vn_lock(vp, flags);
 6664         if (error != 0) {
 6665                 vrele(vp);
 6666                 return (vfs_cache_root_fallback(mp, flags, vpp));
 6667         }
 6668         *vpp = vp;
 6669         return (0);
 6670 }
 6671 
 6672 struct vnode *
 6673 vfs_cache_root_clear(struct mount *mp)
 6674 {
 6675         struct vnode *vp;
 6676 
 6677         /*
 6678          * ops > 0 guarantees there is nobody who can see this vnode
 6679          */
 6680         MPASS(mp->mnt_vfs_ops > 0);
 6681         vp = mp->mnt_rootvnode;
 6682         if (vp != NULL)
 6683                 vn_seqc_write_begin(vp);
 6684         mp->mnt_rootvnode = NULL;
 6685         return (vp);
 6686 }
 6687 
 6688 void
 6689 vfs_cache_root_set(struct mount *mp, struct vnode *vp)
 6690 {
 6691 
 6692         MPASS(mp->mnt_vfs_ops > 0);
 6693         vrefact(vp);
 6694         mp->mnt_rootvnode = vp;
 6695 }
 6696 
 6697 /*
 6698  * These are helper functions for filesystems to traverse all
 6699  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
 6700  *
 6701  * This interface replaces MNT_VNODE_FOREACH.
 6702  */
 6703 
 6704 struct vnode *
 6705 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
 6706 {
 6707         struct vnode *vp;
 6708 
 6709         if (should_yield())
 6710                 kern_yield(PRI_USER);
 6711         MNT_ILOCK(mp);
 6712         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 6713         for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
 6714             vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
 6715                 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
 6716                 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
 6717                         continue;
 6718                 VI_LOCK(vp);
 6719                 if (VN_IS_DOOMED(vp)) {
 6720                         VI_UNLOCK(vp);
 6721                         continue;
 6722                 }
 6723                 break;
 6724         }
 6725         if (vp == NULL) {
 6726                 __mnt_vnode_markerfree_all(mvp, mp);
 6727                 /* MNT_IUNLOCK(mp); -- done in above function */
 6728                 mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
 6729                 return (NULL);
 6730         }
 6731         TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 6732         TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 6733         MNT_IUNLOCK(mp);
 6734         return (vp);
 6735 }
 6736 
 6737 struct vnode *
 6738 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
 6739 {
 6740         struct vnode *vp;
 6741 
 6742         *mvp = vn_alloc_marker(mp);
 6743         MNT_ILOCK(mp);
 6744         MNT_REF(mp);
 6745 
 6746         TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 6747                 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
 6748                 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
 6749                         continue;
 6750                 VI_LOCK(vp);
 6751                 if (VN_IS_DOOMED(vp)) {
 6752                         VI_UNLOCK(vp);
 6753                         continue;
 6754                 }
 6755                 break;
 6756         }
 6757         if (vp == NULL) {
 6758                 MNT_REL(mp);
 6759                 MNT_IUNLOCK(mp);
 6760                 vn_free_marker(*mvp);
 6761                 *mvp = NULL;
 6762                 return (NULL);
 6763         }
 6764         TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 6765         MNT_IUNLOCK(mp);
 6766         return (vp);
 6767 }
 6768 
 6769 void
 6770 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
 6771 {
 6772 
 6773         if (*mvp == NULL) {
 6774                 MNT_IUNLOCK(mp);
 6775                 return;
 6776         }
 6777 
 6778         mtx_assert(MNT_MTX(mp), MA_OWNED);
 6779 
 6780         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 6781         TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 6782         MNT_REL(mp);
 6783         MNT_IUNLOCK(mp);
 6784         vn_free_marker(*mvp);
 6785         *mvp = NULL;
 6786 }
 6787 
 6788 /*
 6789  * These are helper functions for filesystems to traverse their
 6790  * lazy vnodes.  See MNT_VNODE_FOREACH_LAZY() in sys/mount.h
 6791  */
 6792 static void
 6793 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
 6794 {
 6795 
 6796         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 6797 
 6798         MNT_ILOCK(mp);
 6799         MNT_REL(mp);
 6800         MNT_IUNLOCK(mp);
 6801         vn_free_marker(*mvp);
 6802         *mvp = NULL;
 6803 }
 6804 
 6805 /*
 6806  * Relock the mp mount vnode list lock with the vp vnode interlock in the
 6807  * conventional lock order during mnt_vnode_next_lazy iteration.
 6808  *
 6809  * On entry, the mount vnode list lock is held and the vnode interlock is not.
 6810  * The list lock is dropped and reacquired.  On success, both locks are held.
 6811  * On failure, the mount vnode list lock is held but the vnode interlock is
 6812  * not, and the procedure may have yielded.
 6813  */
 6814 static bool
 6815 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp,
 6816     struct vnode *vp)
 6817 {
 6818 
 6819         VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
 6820             TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp,
 6821             ("%s: bad marker", __func__));
 6822         VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
 6823             ("%s: inappropriate vnode", __func__));
 6824         ASSERT_VI_UNLOCKED(vp, __func__);
 6825         mtx_assert(&mp->mnt_listmtx, MA_OWNED);
 6826 
 6827         TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist);
 6828         TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist);
 6829 
 6830         /*
 6831          * Note we may be racing against vdrop which transitioned the hold
 6832          * count to 0 and now waits for the ->mnt_listmtx lock. This is fine,
 6833          * if we are the only user after we get the interlock we will just
 6834          * vdrop.
 6835          */
 6836         vhold(vp);
 6837         mtx_unlock(&mp->mnt_listmtx);
 6838         VI_LOCK(vp);
 6839         if (VN_IS_DOOMED(vp)) {
 6840                 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
 6841                 goto out_lost;
 6842         }
 6843         VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
 6844         /*
 6845          * There is nothing to do if we are the last user.
 6846          */
 6847         if (!refcount_release_if_not_last(&vp->v_holdcnt))
 6848                 goto out_lost;
 6849         mtx_lock(&mp->mnt_listmtx);
 6850         return (true);
 6851 out_lost:
 6852         vdropl(vp);
 6853         maybe_yield();
 6854         mtx_lock(&mp->mnt_listmtx);
 6855         return (false);
 6856 }
 6857 
 6858 static struct vnode *
 6859 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
 6860     void *cbarg)
 6861 {
 6862         struct vnode *vp;
 6863 
 6864         mtx_assert(&mp->mnt_listmtx, MA_OWNED);
 6865         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 6866 restart:
 6867         vp = TAILQ_NEXT(*mvp, v_lazylist);
 6868         while (vp != NULL) {
 6869                 if (vp->v_type == VMARKER) {
 6870                         vp = TAILQ_NEXT(vp, v_lazylist);
 6871                         continue;
 6872                 }
 6873                 /*
 6874                  * See if we want to process the vnode. Note we may encounter a
 6875                  * long string of vnodes we don't care about and hog the list
 6876                  * as a result. Check for it and requeue the marker.
 6877                  */
 6878                 VNPASS(!VN_IS_DOOMED(vp), vp);
 6879                 if (!cb(vp, cbarg)) {
 6880                         if (!should_yield()) {
 6881                                 vp = TAILQ_NEXT(vp, v_lazylist);
 6882                                 continue;
 6883                         }
 6884                         TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp,
 6885                             v_lazylist);
 6886                         TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp,
 6887                             v_lazylist);
 6888                         mtx_unlock(&mp->mnt_listmtx);
 6889                         kern_yield(PRI_USER);
 6890                         mtx_lock(&mp->mnt_listmtx);
 6891                         goto restart;
 6892                 }
 6893                 /*
 6894                  * Try-lock because this is the wrong lock order.
 6895                  */
 6896                 if (!VI_TRYLOCK(vp) &&
 6897                     !mnt_vnode_next_lazy_relock(*mvp, mp, vp))
 6898                         goto restart;
 6899                 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
 6900                 KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
 6901                     ("alien vnode on the lazy list %p %p", vp, mp));
 6902                 VNPASS(vp->v_mount == mp, vp);
 6903                 VNPASS(!VN_IS_DOOMED(vp), vp);
 6904                 break;
 6905         }
 6906         TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
 6907 
 6908         /* Check if we are done */
 6909         if (vp == NULL) {
 6910                 mtx_unlock(&mp->mnt_listmtx);
 6911                 mnt_vnode_markerfree_lazy(mvp, mp);
 6912                 return (NULL);
 6913         }
 6914         TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist);
 6915         mtx_unlock(&mp->mnt_listmtx);
 6916         ASSERT_VI_LOCKED(vp, "lazy iter");
 6917         return (vp);
 6918 }
 6919 
 6920 struct vnode *
 6921 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
 6922     void *cbarg)
 6923 {
 6924 
 6925         if (should_yield())
 6926                 kern_yield(PRI_USER);
 6927         mtx_lock(&mp->mnt_listmtx);
 6928         return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
 6929 }
 6930 
 6931 struct vnode *
 6932 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
 6933     void *cbarg)
 6934 {
 6935         struct vnode *vp;
 6936 
 6937         if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist))
 6938                 return (NULL);
 6939 
 6940         *mvp = vn_alloc_marker(mp);
 6941         MNT_ILOCK(mp);
 6942         MNT_REF(mp);
 6943         MNT_IUNLOCK(mp);
 6944 
 6945         mtx_lock(&mp->mnt_listmtx);
 6946         vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist);
 6947         if (vp == NULL) {
 6948                 mtx_unlock(&mp->mnt_listmtx);
 6949                 mnt_vnode_markerfree_lazy(mvp, mp);
 6950                 return (NULL);
 6951         }
 6952         TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist);
 6953         return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
 6954 }
 6955 
 6956 void
 6957 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
 6958 {
 6959 
 6960         if (*mvp == NULL)
 6961                 return;
 6962 
 6963         mtx_lock(&mp->mnt_listmtx);
 6964         TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
 6965         mtx_unlock(&mp->mnt_listmtx);
 6966         mnt_vnode_markerfree_lazy(mvp, mp);
 6967 }
 6968 
 6969 int
 6970 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp)
 6971 {
 6972 
 6973         if ((cnp->cn_flags & NOEXECCHECK) != 0) {
 6974                 cnp->cn_flags &= ~NOEXECCHECK;
 6975                 return (0);
 6976         }
 6977 
 6978         return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread));
 6979 }
 6980 
 6981 /*
 6982  * Do not use this variant unless you have means other than the hold count
 6983  * to prevent the vnode from getting freed.
 6984  */
 6985 void
 6986 vn_seqc_write_begin_locked(struct vnode *vp)
 6987 {
 6988 
 6989         ASSERT_VI_LOCKED(vp, __func__);
 6990         VNPASS(vp->v_holdcnt > 0, vp);
 6991         VNPASS(vp->v_seqc_users >= 0, vp);
 6992         vp->v_seqc_users++;
 6993         if (vp->v_seqc_users == 1)
 6994                 seqc_sleepable_write_begin(&vp->v_seqc);
 6995 }
 6996 
 6997 void
 6998 vn_seqc_write_begin(struct vnode *vp)
 6999 {
 7000 
 7001         VI_LOCK(vp);
 7002         vn_seqc_write_begin_locked(vp);
 7003         VI_UNLOCK(vp);
 7004 }
 7005 
 7006 void
 7007 vn_seqc_write_end_locked(struct vnode *vp)
 7008 {
 7009 
 7010         ASSERT_VI_LOCKED(vp, __func__);
 7011         VNPASS(vp->v_seqc_users > 0, vp);
 7012         vp->v_seqc_users--;
 7013         if (vp->v_seqc_users == 0)
 7014                 seqc_sleepable_write_end(&vp->v_seqc);
 7015 }
 7016 
 7017 void
 7018 vn_seqc_write_end(struct vnode *vp)
 7019 {
 7020 
 7021         VI_LOCK(vp);
 7022         vn_seqc_write_end_locked(vp);
 7023         VI_UNLOCK(vp);
 7024 }
 7025 
 7026 /*
 7027  * Special case handling for allocating and freeing vnodes.
 7028  *
 7029  * The counter remains unchanged on free so that a doomed vnode will
 7030  * keep testing as in modify as long as it is accessible with SMR.
 7031  */
 7032 static void
 7033 vn_seqc_init(struct vnode *vp)
 7034 {
 7035 
 7036         vp->v_seqc = 0;
 7037         vp->v_seqc_users = 0;
 7038 }
 7039 
 7040 static void
 7041 vn_seqc_write_end_free(struct vnode *vp)
 7042 {
 7043 
 7044         VNPASS(seqc_in_modify(vp->v_seqc), vp);
 7045         VNPASS(vp->v_seqc_users == 1, vp);
 7046 }
 7047 
 7048 void
 7049 vn_irflag_set_locked(struct vnode *vp, short toset)
 7050 {
 7051         short flags;
 7052 
 7053         ASSERT_VI_LOCKED(vp, __func__);
 7054         flags = vn_irflag_read(vp);
 7055         VNASSERT((flags & toset) == 0, vp,
 7056             ("%s: some of the passed flags already set (have %d, passed %d)\n",
 7057             __func__, flags, toset));
 7058         atomic_store_short(&vp->v_irflag, flags | toset);
 7059 }
 7060 
 7061 void
 7062 vn_irflag_set(struct vnode *vp, short toset)
 7063 {
 7064 
 7065         VI_LOCK(vp);
 7066         vn_irflag_set_locked(vp, toset);
 7067         VI_UNLOCK(vp);
 7068 }
 7069 
 7070 void
 7071 vn_irflag_set_cond_locked(struct vnode *vp, short toset)
 7072 {
 7073         short flags;
 7074 
 7075         ASSERT_VI_LOCKED(vp, __func__);
 7076         flags = vn_irflag_read(vp);
 7077         atomic_store_short(&vp->v_irflag, flags | toset);
 7078 }
 7079 
 7080 void
 7081 vn_irflag_set_cond(struct vnode *vp, short toset)
 7082 {
 7083 
 7084         VI_LOCK(vp);
 7085         vn_irflag_set_cond_locked(vp, toset);
 7086         VI_UNLOCK(vp);
 7087 }
 7088 
 7089 void
 7090 vn_irflag_unset_locked(struct vnode *vp, short tounset)
 7091 {
 7092         short flags;
 7093 
 7094         ASSERT_VI_LOCKED(vp, __func__);
 7095         flags = vn_irflag_read(vp);
 7096         VNASSERT((flags & tounset) == tounset, vp,
 7097             ("%s: some of the passed flags not set (have %d, passed %d)\n",
 7098             __func__, flags, tounset));
 7099         atomic_store_short(&vp->v_irflag, flags & ~tounset);
 7100 }
 7101 
 7102 void
 7103 vn_irflag_unset(struct vnode *vp, short tounset)
 7104 {
 7105 
 7106         VI_LOCK(vp);
 7107         vn_irflag_unset_locked(vp, tounset);
 7108         VI_UNLOCK(vp);
 7109 }

Cache object: c2161ba71ca3b620504b1b48abb90168


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.