FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_subr.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-3-Clause
    3  *
    4  * Copyright (c) 1989, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  * (c) UNIX System Laboratories, Inc.
    7  * All or some portions of this file are derived from material licensed
    8  * to the University of California by American Telephone and Telegraph
    9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   10  * the permission of UNIX System Laboratories, Inc.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
   37  */
   38 
   39 /*
   40  * External virtual filesystem routines
   41  */
   42 
   43 #include <sys/cdefs.h>
   44 __FBSDID("$FreeBSD$");
   45 
   46 #include "opt_ddb.h"
   47 #include "opt_watchdog.h"
   48 
   49 #include <sys/param.h>
   50 #include <sys/systm.h>
   51 #include <sys/asan.h>
   52 #include <sys/bio.h>
   53 #include <sys/buf.h>
   54 #include <sys/capsicum.h>
   55 #include <sys/condvar.h>
   56 #include <sys/conf.h>
   57 #include <sys/counter.h>
   58 #include <sys/dirent.h>
   59 #include <sys/event.h>
   60 #include <sys/eventhandler.h>
   61 #include <sys/extattr.h>
   62 #include <sys/file.h>
   63 #include <sys/fcntl.h>
   64 #include <sys/jail.h>
   65 #include <sys/kdb.h>
   66 #include <sys/kernel.h>
   67 #include <sys/kthread.h>
   68 #include <sys/ktr.h>
   69 #include <sys/limits.h>
   70 #include <sys/lockf.h>
   71 #include <sys/malloc.h>
   72 #include <sys/mount.h>
   73 #include <sys/namei.h>
   74 #include <sys/pctrie.h>
   75 #include <sys/priv.h>
   76 #include <sys/reboot.h>
   77 #include <sys/refcount.h>
   78 #include <sys/rwlock.h>
   79 #include <sys/sched.h>
   80 #include <sys/sleepqueue.h>
   81 #include <sys/smr.h>
   82 #include <sys/smp.h>
   83 #include <sys/stat.h>
   84 #include <sys/sysctl.h>
   85 #include <sys/syslog.h>
   86 #include <sys/vmmeter.h>
   87 #include <sys/vnode.h>
   88 #include <sys/watchdog.h>
   89 
   90 #include <machine/stdarg.h>
   91 
   92 #include <security/mac/mac_framework.h>
   93 
   94 #include <vm/vm.h>
   95 #include <vm/vm_object.h>
   96 #include <vm/vm_extern.h>
   97 #include <vm/pmap.h>
   98 #include <vm/vm_map.h>
   99 #include <vm/vm_page.h>
  100 #include <vm/vm_kern.h>
  101 #include <vm/uma.h>
  102 
  103 #if defined(DEBUG_VFS_LOCKS) && (!defined(INVARIANTS) || !defined(WITNESS))
  104 #error DEBUG_VFS_LOCKS requires INVARIANTS and WITNESS
  105 #endif
  106 
  107 #ifdef DDB
  108 #include <ddb/ddb.h>
  109 #endif
  110 
  111 static void     delmntque(struct vnode *vp);
  112 static int      flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
  113                     int slpflag, int slptimeo);
  114 static void     syncer_shutdown(void *arg, int howto);
  115 static int      vtryrecycle(struct vnode *vp);
  116 static void     v_init_counters(struct vnode *);
  117 static void     vn_seqc_init(struct vnode *);
  118 static void     vn_seqc_write_end_free(struct vnode *vp);
  119 static void     vgonel(struct vnode *);
  120 static bool     vhold_recycle_free(struct vnode *);
  121 static void     vdropl_recycle(struct vnode *vp);
  122 static void     vdrop_recycle(struct vnode *vp);
  123 static void     vfs_knllock(void *arg);
  124 static void     vfs_knlunlock(void *arg);
  125 static void     vfs_knl_assert_lock(void *arg, int what);
  126 static void     destroy_vpollinfo(struct vpollinfo *vi);
  127 static int      v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
  128                     daddr_t startlbn, daddr_t endlbn);
  129 static void     vnlru_recalc(void);
  130 
  131 /*
  132  * Number of vnodes in existence.  Increased whenever getnewvnode()
   133  * allocates a new vnode, decreased in vdropl() for a VIRF_DOOMED vnode.
  134  */
  135 static u_long __exclusive_cache_line numvnodes;
  136 
  137 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
  138     "Number of vnodes in existence");
  139 
  140 static counter_u64_t vnodes_created;
  141 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
  142     "Number of vnodes created by getnewvnode");
  143 
  144 /*
  145  * Conversion tables for conversion from vnode types to inode formats
  146  * and back.
  147  */
  148 enum vtype iftovt_tab[16] = {
  149         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
  150         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
  151 };
  152 int vttoif_tab[10] = {
  153         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
  154         S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
  155 };
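/*
 * These tables are normally consulted through the IFTOVT() and VTTOIF()
 * macros.  A minimal usage sketch, assuming the sys/vnode.h definitions
 * index the tables as shown:
 *
 *	#define IFTOVT(mode)	(iftovt_tab[((mode) & S_IFMT) >> 12])
 *	#define VTTOIF(indx)	(vttoif_tab[(int)(indx)])
 *
 *	IFTOVT(S_IFDIR) == VDIR;	VTTOIF(VREG) == S_IFREG;
 */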
  156 
  157 /*
   158  * List of allocated vnodes in the system.
  159  */
  160 static TAILQ_HEAD(freelst, vnode) vnode_list;
  161 static struct vnode *vnode_list_free_marker;
  162 static struct vnode *vnode_list_reclaim_marker;
  163 
  164 /*
  165  * "Free" vnode target.  Free vnodes are rarely completely free, but are
  166  * just ones that are cheap to recycle.  Usually they are for files which
  167  * have been stat'd but not read; these usually have inode and namecache
  168  * data attached to them.  This target is the preferred minimum size of a
  169  * sub-cache consisting mostly of such files. The system balances the size
  170  * of this sub-cache with its complement to try to prevent either from
  171  * thrashing while the other is relatively inactive.  The targets express
  172  * a preference for the best balance.
  173  *
  174  * "Above" this target there are 2 further targets (watermarks) related
   175  * to recycling of free vnodes.  In the best-operating case, the cache is
  176  * exactly full, the free list has size between vlowat and vhiwat above the
  177  * free target, and recycling from it and normal use maintains this state.
  178  * Sometimes the free list is below vlowat or even empty, but this state
  179  * is even better for immediate use provided the cache is not full.
  180  * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
  181  * ones) to reach one of these states.  The watermarks are currently hard-
  182  * coded as 4% and 9% of the available space higher.  These and the default
  183  * of 25% for wantfreevnodes are too large if the memory size is large.
  184  * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
  185  * whenever vnlru_proc() becomes active.
  186  */
  187 static long wantfreevnodes;
  188 static long __exclusive_cache_line freevnodes;
  189 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
  190     &freevnodes, 0, "Number of \"free\" vnodes");
  191 static long freevnodes_old;
  192 
  193 static counter_u64_t recycles_count;
  194 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
  195     "Number of vnodes recycled to meet vnode cache targets");
  196 
  197 static counter_u64_t recycles_free_count;
  198 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count,
  199     "Number of free vnodes recycled to meet vnode cache targets");
  200 
  201 static counter_u64_t deferred_inact;
  202 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact,
  203     "Number of times inactive processing was deferred");
  204 
  205 /* To keep more than one thread at a time from running vfs_getnewfsid */
  206 static struct mtx mntid_mtx;
  207 
  208 /*
  209  * Lock for any access to the following:
  210  *      vnode_list
  211  *      numvnodes
  212  *      freevnodes
  213  */
  214 static struct mtx __exclusive_cache_line vnode_list_mtx;
  215 
  216 /* Publicly exported FS */
  217 struct nfs_public nfs_pub;
  218 
  219 static uma_zone_t buf_trie_zone;
  220 static smr_t buf_trie_smr;
  221 
  222 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
  223 static uma_zone_t vnode_zone;
  224 MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll");
  225 
  226 __read_frequently smr_t vfs_smr;
  227 
  228 /*
  229  * The workitem queue.
  230  *
  231  * It is useful to delay writes of file data and filesystem metadata
  232  * for tens of seconds so that quickly created and deleted files need
  233  * not waste disk bandwidth being created and removed. To realize this,
  234  * we append vnodes to a "workitem" queue. When running with a soft
  235  * updates implementation, most pending metadata dependencies should
   236  * not wait for more than a few seconds. Thus, mounted-on block devices
   237  * are delayed only about half the time that file data is delayed.
   238  * Similarly, directory updates are more critical, so they are delayed only
   239  * about a third of the time that file data is delayed. Thus, there are
  240  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
  241  * one each second (driven off the filesystem syncer process). The
  242  * syncer_delayno variable indicates the next queue that is to be processed.
  243  * Items that need to be processed soon are placed in this queue:
  244  *
  245  *      syncer_workitem_pending[syncer_delayno]
  246  *
  247  * A delay of fifteen seconds is done by placing the request fifteen
  248  * entries later in the queue:
  249  *
  250  *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
  251  *
  252  */
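/*
 * A sketch of the general placement rule implied by the example above
 * (the actual insertion is performed by vn_syncer_add_to_worklist()):
 *
 *	slot = (syncer_delayno + delay) & syncer_mask;
 *	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
 */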
  253 static int syncer_delayno;
  254 static long syncer_mask;
  255 LIST_HEAD(synclist, bufobj);
  256 static struct synclist *syncer_workitem_pending;
  257 /*
  258  * The sync_mtx protects:
  259  *      bo->bo_synclist
  260  *      sync_vnode_count
  261  *      syncer_delayno
  262  *      syncer_state
  263  *      syncer_workitem_pending
  264  *      syncer_worklist_len
  265  *      rushjob
  266  */
  267 static struct mtx sync_mtx;
  268 static struct cv sync_wakeup;
  269 
  270 #define SYNCER_MAXDELAY         32
  271 static int syncer_maxdelay = SYNCER_MAXDELAY;   /* maximum delay time */
  272 static int syncdelay = 30;              /* max time to delay syncing data */
  273 static int filedelay = 30;              /* time to delay syncing files */
  274 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
  275     "Time to delay syncing files (in seconds)");
  276 static int dirdelay = 29;               /* time to delay syncing directories */
  277 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
  278     "Time to delay syncing directories (in seconds)");
  279 static int metadelay = 28;              /* time to delay syncing metadata */
  280 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
  281     "Time to delay syncing metadata (in seconds)");
  282 static int rushjob;             /* number of slots to run ASAP */
  283 static int stat_rush_requests;  /* number of times I/O speeded up */
  284 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
  285     "Number of times I/O speeded up (rush requests)");
  286 
  287 #define VDBATCH_SIZE 8
  288 struct vdbatch {
  289         u_int index;
  290         long freevnodes;
  291         struct mtx lock;
  292         struct vnode *tab[VDBATCH_SIZE];
  293 };
  294 DPCPU_DEFINE_STATIC(struct vdbatch, vd);
  295 
  296 static void     vdbatch_dequeue(struct vnode *vp);
  297 
  298 /*
  299  * When shutting down the syncer, run it at four times normal speed.
  300  */
  301 #define SYNCER_SHUTDOWN_SPEEDUP         4
  302 static int sync_vnode_count;
  303 static int syncer_worklist_len;
  304 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
  305     syncer_state;
  306 
  307 /* Target for maximum number of vnodes. */
  308 u_long desiredvnodes;
  309 static u_long gapvnodes;                /* gap between wanted and desired */
  310 static u_long vhiwat;           /* enough extras after expansion */
  311 static u_long vlowat;           /* minimal extras before expansion */
  312 static u_long vstir;            /* nonzero to stir non-free vnodes */
  313 static volatile int vsmalltrigger = 8;  /* pref to keep if > this many pages */
  314 
  315 static u_long vnlru_read_freevnodes(void);
  316 
  317 /*
  318  * Note that no attempt is made to sanitize these parameters.
  319  */
  320 static int
  321 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS)
  322 {
  323         u_long val;
  324         int error;
  325 
  326         val = desiredvnodes;
  327         error = sysctl_handle_long(oidp, &val, 0, req);
  328         if (error != 0 || req->newptr == NULL)
  329                 return (error);
  330 
  331         if (val == desiredvnodes)
  332                 return (0);
  333         mtx_lock(&vnode_list_mtx);
  334         desiredvnodes = val;
  335         wantfreevnodes = desiredvnodes / 4;
  336         vnlru_recalc();
  337         mtx_unlock(&vnode_list_mtx);
  338         /*
  339          * XXX There is no protection against multiple threads changing
  340          * desiredvnodes at the same time. Locking above only helps vnlru and
  341          * getnewvnode.
  342          */
  343         vfs_hash_changesize(desiredvnodes);
  344         cache_changesize(desiredvnodes);
  345         return (0);
  346 }
  347 
  348 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
  349     CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
  350     "LU", "Target for maximum number of vnodes");
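/*
 * The handler above services kern.maxvnodes.  A minimal userland sketch for
 * tuning it (needs <sys/types.h>, <sys/sysctl.h> and <err.h>; the new value
 * is illustrative only):
 *
 *	u_long newmax = 2000000;
 *
 *	if (sysctlbyname("kern.maxvnodes", NULL, NULL,
 *	    &newmax, sizeof(newmax)) == -1)
 *		err(1, "sysctlbyname");
 */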
  351 
  352 static int
  353 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS)
  354 {
  355         u_long val;
  356         int error;
  357 
  358         val = wantfreevnodes;
  359         error = sysctl_handle_long(oidp, &val, 0, req);
  360         if (error != 0 || req->newptr == NULL)
  361                 return (error);
  362 
  363         if (val == wantfreevnodes)
  364                 return (0);
  365         mtx_lock(&vnode_list_mtx);
  366         wantfreevnodes = val;
  367         vnlru_recalc();
  368         mtx_unlock(&vnode_list_mtx);
  369         return (0);
  370 }
  371 
  372 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes,
  373     CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
  374     "LU", "Target for minimum number of \"free\" vnodes");
  375 
  376 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
  377     &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
  378 static int vnlru_nowhere;
  379 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW | CTLFLAG_STATS,
  380     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
  381 
  382 static int
  383 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
  384 {
  385         struct vnode *vp;
  386         struct nameidata nd;
  387         char *buf;
  388         unsigned long ndflags;
  389         int error;
  390 
  391         if (req->newptr == NULL)
  392                 return (EINVAL);
  393         if (req->newlen >= PATH_MAX)
  394                 return (E2BIG);
  395 
  396         buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
  397         error = SYSCTL_IN(req, buf, req->newlen);
  398         if (error != 0)
  399                 goto out;
  400 
  401         buf[req->newlen] = '\0';
  402 
  403         ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1;
  404         NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf);
  405         if ((error = namei(&nd)) != 0)
  406                 goto out;
  407         vp = nd.ni_vp;
  408 
  409         if (VN_IS_DOOMED(vp)) {
  410                 /*
  411                  * This vnode is being recycled.  Return != 0 to let the caller
  412                  * know that the sysctl had no effect.  Return EAGAIN because a
  413                  * subsequent call will likely succeed (since namei will create
  414                  * a new vnode if necessary)
  415                  */
  416                 error = EAGAIN;
  417                 goto putvnode;
  418         }
  419 
  420         counter_u64_add(recycles_count, 1);
  421         vgone(vp);
  422 putvnode:
  423         vput(vp);
  424         NDFREE_PNBUF(&nd);
  425 out:
  426         free(buf, M_TEMP);
  427         return (error);
  428 }
  429 
  430 static int
  431 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
  432 {
  433         struct thread *td = curthread;
  434         struct vnode *vp;
  435         struct file *fp;
  436         int error;
  437         int fd;
  438 
  439         if (req->newptr == NULL)
  440                 return (EBADF);
  441 
  442         error = sysctl_handle_int(oidp, &fd, 0, req);
  443         if (error != 0)
  444                 return (error);
  445         error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
  446         if (error != 0)
  447                 return (error);
  448         vp = fp->f_vnode;
  449 
  450         error = vn_lock(vp, LK_EXCLUSIVE);
  451         if (error != 0)
  452                 goto drop;
  453 
  454         counter_u64_add(recycles_count, 1);
  455         vgone(vp);
  456         VOP_UNLOCK(vp);
  457 drop:
  458         fdrop(fp, td);
  459         return (error);
  460 }
  461 
  462 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
  463     CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
  464     sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
  465 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
  466     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
  467     sysctl_ftry_reclaim_vnode, "I",
  468     "Try to reclaim a vnode by its file descriptor");
  469 
  470 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
  471 #define vnsz2log 8
  472 #ifndef DEBUG_LOCKS
  473 _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log &&
  474     sizeof(struct vnode) < 1UL << (vnsz2log + 1),
  475     "vnsz2log needs to be updated");
  476 #endif
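/*
 * vnsz2log is consumed when a vnode's v_hash is seeded from its address,
 * roughly along these lines (a sketch, not necessarily the exact expression):
 *
 *	vp->v_hash = (uintptr_t)vp >> vnsz2log;
 */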
  477 
  478 /*
  479  * Support for the bufobj clean & dirty pctrie.
  480  */
  481 static void *
  482 buf_trie_alloc(struct pctrie *ptree)
  483 {
  484         return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT));
  485 }
  486 
  487 static void
  488 buf_trie_free(struct pctrie *ptree, void *node)
  489 {
  490         uma_zfree_smr(buf_trie_zone, node);
  491 }
  492 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
  493     buf_trie_smr);
  494 
  495 /*
  496  * Initialize the vnode management data structures.
  497  *
  498  * Reevaluate the following cap on the number of vnodes after the physical
  499  * memory size exceeds 512GB.  In the limit, as the physical memory size
  500  * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
  501  */
  502 #ifndef MAXVNODES_MAX
  503 #define MAXVNODES_MAX   (512UL * 1024 * 1024 / 64)      /* 8M */
  504 #endif
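/*
 * A worked instance of the cap above: 512 GB of physical memory is
 * 512 * 1024 * 1024 KB = 536,870,912 KB, and 536,870,912 / 64 = 8,388,608,
 * i.e. the 8M value of MAXVNODES_MAX, matching the 64:1 KB-to-vnode ratio
 * mentioned in the comment.
 */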
  505 
  506 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
  507 
  508 static struct vnode *
  509 vn_alloc_marker(struct mount *mp)
  510 {
  511         struct vnode *vp;
  512 
  513         vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
  514         vp->v_type = VMARKER;
  515         vp->v_mount = mp;
  516 
  517         return (vp);
  518 }
  519 
  520 static void
  521 vn_free_marker(struct vnode *vp)
  522 {
  523 
  524         MPASS(vp->v_type == VMARKER);
  525         free(vp, M_VNODE_MARKER);
  526 }
  527 
  528 #ifdef KASAN
  529 static int
  530 vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused)
  531 {
  532         kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0);
  533         return (0);
  534 }
  535 
  536 static void
  537 vnode_dtor(void *mem, int size, void *arg __unused)
  538 {
  539         size_t end1, end2, off1, off2;
  540 
  541         _Static_assert(offsetof(struct vnode, v_vnodelist) <
  542             offsetof(struct vnode, v_dbatchcpu),
  543             "KASAN marks require updating");
  544 
  545         off1 = offsetof(struct vnode, v_vnodelist);
  546         off2 = offsetof(struct vnode, v_dbatchcpu);
  547         end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist);
  548         end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu);
  549 
  550         /*
   551          * Accesses to the v_vnodelist and v_dbatchcpu fields are permitted even
  552          * after the vnode has been freed.  Try to get some KASAN coverage by
  553          * marking everything except those two fields as invalid.  Because
  554          * KASAN's tracking is not byte-granular, any preceding fields sharing
  555          * the same 8-byte aligned word must also be marked valid.
  556          */
  557 
  558         /* Handle the area from the start until v_vnodelist... */
  559         off1 = rounddown2(off1, KASAN_SHADOW_SCALE);
  560         kasan_mark(mem, off1, off1, KASAN_UMA_FREED);
  561 
  562         /* ... then the area between v_vnodelist and v_dbatchcpu ... */
  563         off1 = roundup2(end1, KASAN_SHADOW_SCALE);
  564         off2 = rounddown2(off2, KASAN_SHADOW_SCALE);
  565         if (off2 > off1)
  566                 kasan_mark((void *)((char *)mem + off1), off2 - off1,
  567                     off2 - off1, KASAN_UMA_FREED);
  568 
  569         /* ... and finally the area from v_dbatchcpu to the end. */
  570         off2 = roundup2(end2, KASAN_SHADOW_SCALE);
  571         kasan_mark((void *)((char *)mem + off2), size - off2, size - off2,
  572             KASAN_UMA_FREED);
  573 }
  574 #endif /* KASAN */
  575 
  576 /*
  577  * Initialize a vnode as it first enters the zone.
  578  */
  579 static int
  580 vnode_init(void *mem, int size, int flags)
  581 {
  582         struct vnode *vp;
  583 
  584         vp = mem;
  585         bzero(vp, size);
  586         /*
  587          * Setup locks.
  588          */
  589         vp->v_vnlock = &vp->v_lock;
  590         mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
  591         /*
  592          * By default, don't allow shared locks unless filesystems opt-in.
  593          */
  594         lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
  595             LK_NOSHARE | LK_IS_VNODE);
  596         /*
  597          * Initialize bufobj.
  598          */
  599         bufobj_init(&vp->v_bufobj, vp);
  600         /*
  601          * Initialize namecache.
  602          */
  603         cache_vnode_init(vp);
  604         /*
  605          * Initialize rangelocks.
  606          */
  607         rangelock_init(&vp->v_rl);
  608 
  609         vp->v_dbatchcpu = NOCPU;
  610 
  611         vp->v_state = VSTATE_DEAD;
  612 
  613         /*
  614          * Check vhold_recycle_free for an explanation.
  615          */
  616         vp->v_holdcnt = VHOLD_NO_SMR;
  617         vp->v_type = VNON;
  618         mtx_lock(&vnode_list_mtx);
  619         TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
  620         mtx_unlock(&vnode_list_mtx);
  621         return (0);
  622 }
  623 
  624 /*
  625  * Free a vnode when it is cleared from the zone.
  626  */
  627 static void
  628 vnode_fini(void *mem, int size)
  629 {
  630         struct vnode *vp;
  631         struct bufobj *bo;
  632 
  633         vp = mem;
  634         vdbatch_dequeue(vp);
  635         mtx_lock(&vnode_list_mtx);
  636         TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
  637         mtx_unlock(&vnode_list_mtx);
  638         rangelock_destroy(&vp->v_rl);
  639         lockdestroy(vp->v_vnlock);
  640         mtx_destroy(&vp->v_interlock);
  641         bo = &vp->v_bufobj;
  642         rw_destroy(BO_LOCKPTR(bo));
  643 
  644         kasan_mark(mem, size, size, 0);
  645 }
  646 
  647 /*
  648  * Provide the size of NFS nclnode and NFS fh for calculation of the
  649  * vnode memory consumption.  The size is specified directly to
  650  * eliminate dependency on NFS-private header.
   651  * eliminate a dependency on the NFS-private header.
   652  *
   653  * Other filesystems (like UFS and ZFS) may use bigger or smaller
   654  * private inode data, but the NFS-based estimate is ample enough.
  655  * platforms.
  656  *
  657  * Namecache structure size is heuristically
  658  * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
  659  */
  660 #ifdef _LP64
  661 #define NFS_NCLNODE_SZ  (528 + 64)
  662 #define NC_SZ           148
  663 #else
  664 #define NFS_NCLNODE_SZ  (360 + 32)
  665 #define NC_SZ           92
  666 #endif
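/*
 * These constants feed the virtvnodes estimate in vntblinit() below, where
 * the per-vnode memory cost is approximated as
 *
 *	sizeof(struct vm_object) + sizeof(struct vnode) +
 *	    NC_SZ * ncsizefactor + NFS_NCLNODE_SZ
 *
 * and desiredvnodes is capped so that the total stays within 1/10th of the
 * kernel heap (vm_kmem_size).
 */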
  667 
  668 static void
  669 vntblinit(void *dummy __unused)
  670 {
  671         struct vdbatch *vd;
  672         uma_ctor ctor;
  673         uma_dtor dtor;
  674         int cpu, physvnodes, virtvnodes;
  675 
  676         /*
  677          * Desiredvnodes is a function of the physical memory size and the
  678          * kernel's heap size.  Generally speaking, it scales with the
  679          * physical memory size.  The ratio of desiredvnodes to the physical
  680          * memory size is 1:16 until desiredvnodes exceeds 98,304.
  681          * Thereafter, the
  682          * marginal ratio of desiredvnodes to the physical memory size is
  683          * 1:64.  However, desiredvnodes is limited by the kernel's heap
  684          * size.  The memory required by desiredvnodes vnodes and vm objects
  685          * must not exceed 1/10th of the kernel's heap size.
  686          */
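        /*
         * A worked example of the formula below, with illustrative numbers
         * and the maxproc term ignored: with 4 GB of physical memory,
         * pgtok(vm_cnt.v_page_count) is about 4,194,304 KB, so
         * physvnodes ~= 4194304 / 64 + 3 * min(98304 * 16, 4194304) / 64
         *            ~= 65536 + 73728 = 139264 vnodes.
         */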
  687         physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
  688             3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
  689         virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
  690             sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
  691         desiredvnodes = min(physvnodes, virtvnodes);
  692         if (desiredvnodes > MAXVNODES_MAX) {
  693                 if (bootverbose)
  694                         printf("Reducing kern.maxvnodes %lu -> %lu\n",
  695                             desiredvnodes, MAXVNODES_MAX);
  696                 desiredvnodes = MAXVNODES_MAX;
  697         }
  698         wantfreevnodes = desiredvnodes / 4;
  699         mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
  700         TAILQ_INIT(&vnode_list);
  701         mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
  702         /*
  703          * The lock is taken to appease WITNESS.
  704          */
  705         mtx_lock(&vnode_list_mtx);
  706         vnlru_recalc();
  707         mtx_unlock(&vnode_list_mtx);
  708         vnode_list_free_marker = vn_alloc_marker(NULL);
  709         TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist);
  710         vnode_list_reclaim_marker = vn_alloc_marker(NULL);
  711         TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
  712 
  713 #ifdef KASAN
  714         ctor = vnode_ctor;
  715         dtor = vnode_dtor;
  716 #else
  717         ctor = NULL;
  718         dtor = NULL;
  719 #endif
  720         vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor,
  721             vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN);
  722         uma_zone_set_smr(vnode_zone, vfs_smr);
  723 
  724         /*
   725          * Preallocate enough nodes to support one per buf so that
   726          * we cannot fail an insert.  reassignbuf() callers cannot
   727          * tolerate insertion failure.
  728          */
  729         buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
  730             NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 
  731             UMA_ZONE_NOFREE | UMA_ZONE_SMR);
  732         buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
  733         uma_prealloc(buf_trie_zone, nbuf);
  734 
  735         vnodes_created = counter_u64_alloc(M_WAITOK);
  736         recycles_count = counter_u64_alloc(M_WAITOK);
  737         recycles_free_count = counter_u64_alloc(M_WAITOK);
  738         deferred_inact = counter_u64_alloc(M_WAITOK);
  739 
  740         /*
  741          * Initialize the filesystem syncer.
  742          */
  743         syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
  744             &syncer_mask);
  745         syncer_maxdelay = syncer_mask + 1;
  746         mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
  747         cv_init(&sync_wakeup, "syncer");
  748 
  749         CPU_FOREACH(cpu) {
  750                 vd = DPCPU_ID_PTR((cpu), vd);
  751                 bzero(vd, sizeof(*vd));
  752                 mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
  753         }
  754 }
  755 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
  756 
  757 /*
  758  * Mark a mount point as busy. Used to synchronize access and to delay
   759  * unmounting. Note that mountlist_mtx is not released on failure.
  760  *
  761  * vfs_busy() is a custom lock, it can block the caller.
  762  * vfs_busy() only sleeps if the unmount is active on the mount point.
   763  * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
   764  * lock of any vnode belonging to mp.
  765  *
  766  * Lookup uses vfs_busy() to traverse mount points.
  767  * root fs                      var fs
  768  * / vnode lock         A       / vnode lock (/var)             D
  769  * /var vnode lock      B       /log vnode lock(/var/log)       E
  770  * vfs_busy lock        C       vfs_busy lock                   F
  771  *
  772  * Within each file system, the lock order is C->A->B and F->D->E.
  773  *
  774  * When traversing across mounts, the system follows that lock order:
  775  *
  776  *        C->A->B
  777  *              |
  778  *              +->F->D->E
  779  *
   780  * The lookup() process for namei("/var") illustrates the sequence:
  781  *  1. VOP_LOOKUP() obtains B while A is held
  782  *  2. vfs_busy() obtains a shared lock on F while A and B are held
  783  *  3. vput() releases lock on B
  784  *  4. vput() releases lock on A
  785  *  5. VFS_ROOT() obtains lock on D while shared lock on F is held
  786  *  6. vfs_unbusy() releases shared lock on F
  787  *  7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
   788  *     An attempt to lock A (instead of vp_crossmp) while D is held would
  789  *     violate the global order, causing deadlocks.
  790  *
  791  * dounmount() locks B while F is drained.  Note that for stacked
  792  * filesystems, D and B in the example above may be the same lock,
   793  * which introduces a potential lock order reversal deadlock between
  794  * dounmount() and step 5 above.  These filesystems may avoid the LOR
  795  * by setting VV_CROSSLOCK on the covered vnode so that lock B will
  796  * remain held until after step 5.
  797  */
  798 int
  799 vfs_busy(struct mount *mp, int flags)
  800 {
  801         struct mount_pcpu *mpcpu;
  802 
  803         MPASS((flags & ~MBF_MASK) == 0);
  804         CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
  805 
  806         if (vfs_op_thread_enter(mp, mpcpu)) {
  807                 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
  808                 MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
  809                 MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
  810                 vfs_mp_count_add_pcpu(mpcpu, ref, 1);
  811                 vfs_mp_count_add_pcpu(mpcpu, lockref, 1);
  812                 vfs_op_thread_exit(mp, mpcpu);
  813                 if (flags & MBF_MNTLSTLOCK)
  814                         mtx_unlock(&mountlist_mtx);
  815                 return (0);
  816         }
  817 
  818         MNT_ILOCK(mp);
  819         vfs_assert_mount_counters(mp);
  820         MNT_REF(mp);
  821         /*
   822          * If the mount point is currently being unmounted, sleep until its
   823          * fate is decided.  If the thread doing the unmounting fails,
   824          * it will clear the MNTK_UNMOUNT flag before waking us up, indicating
   825          * that this mount point has survived the unmount attempt and vfs_busy
   826          * should retry.  Otherwise the unmounting thread will set the MNTK_REFEXPIRE
   827          * flag in addition to MNTK_UNMOUNT, indicating that the mount point is
  828          * about to be really destroyed.  vfs_busy needs to release its
  829          * reference on the mount point in this case and return with ENOENT,
  830          * telling the caller the mount it tried to busy is no longer valid.
  831          */
  832         while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
  833                 KASSERT(TAILQ_EMPTY(&mp->mnt_uppers),
  834                     ("%s: non-empty upper mount list with pending unmount",
  835                     __func__));
  836                 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
  837                         MNT_REL(mp);
  838                         MNT_IUNLOCK(mp);
  839                         CTR1(KTR_VFS, "%s: failed busying before sleeping",
  840                             __func__);
  841                         return (ENOENT);
  842                 }
  843                 if (flags & MBF_MNTLSTLOCK)
  844                         mtx_unlock(&mountlist_mtx);
  845                 mp->mnt_kern_flag |= MNTK_MWAIT;
  846                 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
  847                 if (flags & MBF_MNTLSTLOCK)
  848                         mtx_lock(&mountlist_mtx);
  849                 MNT_ILOCK(mp);
  850         }
  851         if (flags & MBF_MNTLSTLOCK)
  852                 mtx_unlock(&mountlist_mtx);
  853         mp->mnt_lockref++;
  854         MNT_IUNLOCK(mp);
  855         return (0);
  856 }
  857 
  858 /*
   859  * Release a busy reference on a filesystem.
  860  */
  861 void
  862 vfs_unbusy(struct mount *mp)
  863 {
  864         struct mount_pcpu *mpcpu;
  865         int c;
  866 
  867         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
  868 
  869         if (vfs_op_thread_enter(mp, mpcpu)) {
  870                 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
  871                 vfs_mp_count_sub_pcpu(mpcpu, lockref, 1);
  872                 vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
  873                 vfs_op_thread_exit(mp, mpcpu);
  874                 return;
  875         }
  876 
  877         MNT_ILOCK(mp);
  878         vfs_assert_mount_counters(mp);
  879         MNT_REL(mp);
  880         c = --mp->mnt_lockref;
  881         if (mp->mnt_vfs_ops == 0) {
  882                 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
  883                 MNT_IUNLOCK(mp);
  884                 return;
  885         }
  886         if (c < 0)
  887                 vfs_dump_mount_counters(mp);
  888         if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
  889                 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
  890                 CTR1(KTR_VFS, "%s: waking up waiters", __func__);
  891                 mp->mnt_kern_flag &= ~MNTK_DRAINING;
  892                 wakeup(&mp->mnt_lockref);
  893         }
  894         MNT_IUNLOCK(mp);
  895 }
  896 
  897 /*
  898  * Lookup a mount point by filesystem identifier.
  899  */
  900 struct mount *
  901 vfs_getvfs(fsid_t *fsid)
  902 {
  903         struct mount *mp;
  904 
  905         CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
  906         mtx_lock(&mountlist_mtx);
  907         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
  908                 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
  909                         vfs_ref(mp);
  910                         mtx_unlock(&mountlist_mtx);
  911                         return (mp);
  912                 }
  913         }
  914         mtx_unlock(&mountlist_mtx);
  915         CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
  916         return ((struct mount *) 0);
  917 }
  918 
  919 /*
  920  * Lookup a mount point by filesystem identifier, busying it before
  921  * returning.
  922  *
   923  * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
   924  * cache for popular filesystem identifiers.  The cache is lockless, using
   925  * the fact that struct mount objects are never freed.  In the worst case
   926  * we may get a pointer to an unmounted or even a different filesystem, so
   927  * we have to check what we got, and take the slow path if so.
  928  */
  929 struct mount *
  930 vfs_busyfs(fsid_t *fsid)
  931 {
  932 #define FSID_CACHE_SIZE 256
  933         typedef struct mount * volatile vmp_t;
  934         static vmp_t cache[FSID_CACHE_SIZE];
  935         struct mount *mp;
  936         int error;
  937         uint32_t hash;
  938 
  939         CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
  940         hash = fsid->val[0] ^ fsid->val[1];
  941         hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
  942         mp = cache[hash];
  943         if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0)
  944                 goto slow;
  945         if (vfs_busy(mp, 0) != 0) {
  946                 cache[hash] = NULL;
  947                 goto slow;
  948         }
  949         if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0)
  950                 return (mp);
  951         else
   952                 vfs_unbusy(mp);
  953 
  954 slow:
  955         mtx_lock(&mountlist_mtx);
  956         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
  957                 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
  958                         error = vfs_busy(mp, MBF_MNTLSTLOCK);
  959                         if (error) {
  960                                 cache[hash] = NULL;
  961                                 mtx_unlock(&mountlist_mtx);
  962                                 return (NULL);
  963                         }
  964                         cache[hash] = mp;
  965                         return (mp);
  966                 }
  967         }
  968         CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
  969         mtx_unlock(&mountlist_mtx);
  970         return ((struct mount *) 0);
  971 }
  972 
  973 /*
  974  * Check if a user can access privileged mount options.
  975  */
  976 int
  977 vfs_suser(struct mount *mp, struct thread *td)
  978 {
  979         int error;
  980 
  981         if (jailed(td->td_ucred)) {
  982                 /*
  983                  * If the jail of the calling thread lacks permission for
  984                  * this type of file system, deny immediately.
  985                  */
  986                 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
  987                         return (EPERM);
  988 
  989                 /*
  990                  * If the file system was mounted outside the jail of the
  991                  * calling thread, deny immediately.
  992                  */
  993                 if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
  994                         return (EPERM);
  995         }
  996 
  997         /*
  998          * If file system supports delegated administration, we don't check
  999          * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
 1000          * by the file system itself.
 1001          * If this is not the user that did original mount, we check for
 1002          * the PRIV_VFS_MOUNT_OWNER privilege.
 1003          */
 1004         if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
 1005             mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
 1006                 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
 1007                         return (error);
 1008         }
 1009         return (0);
 1010 }
 1011 
 1012 /*
 1013  * Get a new unique fsid.  Try to make its val[0] unique, since this value
 1014  * will be used to create fake device numbers for stat().  Also try (but
  1015  * not so hard) to make its val[0] unique mod 2^16, since some emulators only
 1016  * support 16-bit device numbers.  We end up with unique val[0]'s for the
 1017  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 1018  *
 1019  * Keep in mind that several mounts may be running in parallel.  Starting
 1020  * the search one past where the previous search terminated is both a
 1021  * micro-optimization and a defense against returning the same fsid to
 1022  * different mounts.
 1023  */
 1024 void
 1025 vfs_getnewfsid(struct mount *mp)
 1026 {
 1027         static uint16_t mntid_base;
 1028         struct mount *nmp;
 1029         fsid_t tfsid;
 1030         int mtype;
 1031 
 1032         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 1033         mtx_lock(&mntid_mtx);
 1034         mtype = mp->mnt_vfc->vfc_typenum;
 1035         tfsid.val[1] = mtype;
 1036         mtype = (mtype & 0xFF) << 24;
 1037         for (;;) {
 1038                 tfsid.val[0] = makedev(255,
 1039                     mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 1040                 mntid_base++;
 1041                 if ((nmp = vfs_getvfs(&tfsid)) == NULL)
 1042                         break;
 1043                 vfs_rel(nmp);
 1044         }
 1045         mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
 1046         mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
 1047         mtx_unlock(&mntid_mtx);
 1048 }
 1049 
 1050 /*
 1051  * Knob to control the precision of file timestamps:
 1052  *
 1053  *   0 = seconds only; nanoseconds zeroed.
 1054  *   1 = seconds and nanoseconds, accurate within 1/HZ.
 1055  *   2 = seconds and nanoseconds, truncated to microseconds.
 1056  * >=3 = seconds and nanoseconds, maximum precision.
 1057  */
 1058 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 1059 
 1060 static int timestamp_precision = TSP_USEC;
 1061 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
 1062     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
 1063     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
 1064     "3+: sec + ns (max. precision))");
 1065 
 1066 /*
 1067  * Get a current timestamp.
 1068  */
 1069 void
 1070 vfs_timestamp(struct timespec *tsp)
 1071 {
 1072         struct timeval tv;
 1073 
 1074         switch (timestamp_precision) {
 1075         case TSP_SEC:
 1076                 tsp->tv_sec = time_second;
 1077                 tsp->tv_nsec = 0;
 1078                 break;
 1079         case TSP_HZ:
 1080                 getnanotime(tsp);
 1081                 break;
 1082         case TSP_USEC:
 1083                 microtime(&tv);
 1084                 TIMEVAL_TO_TIMESPEC(&tv, tsp);
 1085                 break;
 1086         case TSP_NSEC:
 1087         default:
 1088                 nanotime(tsp);
 1089                 break;
 1090         }
 1091 }
 1092 
 1093 /*
 1094  * Set vnode attributes to VNOVAL
 1095  */
 1096 void
 1097 vattr_null(struct vattr *vap)
 1098 {
 1099 
 1100         vap->va_type = VNON;
 1101         vap->va_size = VNOVAL;
 1102         vap->va_bytes = VNOVAL;
 1103         vap->va_mode = VNOVAL;
 1104         vap->va_nlink = VNOVAL;
 1105         vap->va_uid = VNOVAL;
 1106         vap->va_gid = VNOVAL;
 1107         vap->va_fsid = VNOVAL;
 1108         vap->va_fileid = VNOVAL;
 1109         vap->va_blocksize = VNOVAL;
 1110         vap->va_rdev = VNOVAL;
 1111         vap->va_atime.tv_sec = VNOVAL;
 1112         vap->va_atime.tv_nsec = VNOVAL;
 1113         vap->va_mtime.tv_sec = VNOVAL;
 1114         vap->va_mtime.tv_nsec = VNOVAL;
 1115         vap->va_ctime.tv_sec = VNOVAL;
 1116         vap->va_ctime.tv_nsec = VNOVAL;
 1117         vap->va_birthtime.tv_sec = VNOVAL;
 1118         vap->va_birthtime.tv_nsec = VNOVAL;
 1119         vap->va_flags = VNOVAL;
 1120         vap->va_gen = VNOVAL;
 1121         vap->va_vaflags = 0;
 1122 }
 1123 
 1124 /*
 1125  * Try to reduce the total number of vnodes.
 1126  *
 1127  * This routine (and its user) are buggy in at least the following ways:
 1128  * - all parameters were picked years ago when RAM sizes were significantly
 1129  *   smaller
 1130  * - it can pick vnodes based on pages used by the vm object, but filesystems
  1131  *   like ZFS don't use it, making the pick broken
  1132  * - since ZFS has its own aging policy, it gets partially counteracted by this one
 1133  * - a dedicated method should be provided for filesystems to let them decide
 1134  *   whether the vnode should be recycled
 1135  *
 1136  * This routine is called when we have too many vnodes.  It attempts
 1137  * to free <count> vnodes and will potentially free vnodes that still
 1138  * have VM backing store (VM backing store is typically the cause
 1139  * of a vnode blowout so we want to do this).  Therefore, this operation
 1140  * is not considered cheap.
 1141  *
  1142  * A number of conditions may prevent a vnode from being reclaimed:
 1143  * the buffer cache may have references on the vnode, a directory
 1144  * vnode may still have references due to the namei cache representing
 1145  * underlying files, or the vnode may be in active use.   It is not
 1146  * desirable to reuse such vnodes.  These conditions may cause the
 1147  * number of vnodes to reach some minimum value regardless of what
 1148  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 1149  *
 1150  * @param reclaim_nc_src Only reclaim directories with outgoing namecache
  1151  *                       entries if this argument is true
 1152  * @param trigger        Only reclaim vnodes with fewer than this many resident
 1153  *                       pages.
 1154  * @param target         How many vnodes to reclaim.
 1155  * @return               The number of vnodes that were reclaimed.
 1156  */
 1157 static int
 1158 vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
 1159 {
 1160         struct vnode *vp, *mvp;
 1161         struct mount *mp;
 1162         struct vm_object *object;
 1163         u_long done;
 1164         bool retried;
 1165 
 1166         mtx_assert(&vnode_list_mtx, MA_OWNED);
 1167 
 1168         retried = false;
 1169         done = 0;
 1170 
 1171         mvp = vnode_list_reclaim_marker;
 1172 restart:
 1173         vp = mvp;
 1174         while (done < target) {
 1175                 vp = TAILQ_NEXT(vp, v_vnodelist);
 1176                 if (__predict_false(vp == NULL))
 1177                         break;
 1178 
 1179                 if (__predict_false(vp->v_type == VMARKER))
 1180                         continue;
 1181 
 1182                 /*
 1183                  * If it's been deconstructed already, it's still
 1184                  * referenced, or it exceeds the trigger, skip it.
 1185                  * Also skip free vnodes.  We are trying to make space
 1186                  * to expand the free list, not reduce it.
 1187                  */
 1188                 if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
 1189                     (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
 1190                         goto next_iter;
 1191 
 1192                 if (vp->v_type == VBAD || vp->v_type == VNON)
 1193                         goto next_iter;
 1194 
 1195                 object = atomic_load_ptr(&vp->v_object);
 1196                 if (object == NULL || object->resident_page_count > trigger) {
 1197                         goto next_iter;
 1198                 }
 1199 
 1200                 /*
 1201                  * Handle races against vnode allocation. Filesystems lock the
 1202                  * vnode some time after it gets returned from getnewvnode,
 1203                  * despite type and hold count being manipulated earlier.
 1204                  * Resorting to checking v_mount restores guarantees present
 1205                  * before the global list was reworked to contain all vnodes.
 1206                  */
 1207                 if (!VI_TRYLOCK(vp))
 1208                         goto next_iter;
 1209                 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
 1210                         VI_UNLOCK(vp);
 1211                         goto next_iter;
 1212                 }
 1213                 if (vp->v_mount == NULL) {
 1214                         VI_UNLOCK(vp);
 1215                         goto next_iter;
 1216                 }
 1217                 vholdl(vp);
 1218                 VI_UNLOCK(vp);
 1219                 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 1220                 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
 1221                 mtx_unlock(&vnode_list_mtx);
 1222 
 1223                 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 1224                         vdrop_recycle(vp);
 1225                         goto next_iter_unlocked;
 1226                 }
 1227                 if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) {
 1228                         vdrop_recycle(vp);
 1229                         vn_finished_write(mp);
 1230                         goto next_iter_unlocked;
 1231                 }
 1232 
 1233                 VI_LOCK(vp);
 1234                 if (vp->v_usecount > 0 ||
 1235                     (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
 1236                     (vp->v_object != NULL && vp->v_object->handle == vp &&
 1237                     vp->v_object->resident_page_count > trigger)) {
 1238                         VOP_UNLOCK(vp);
 1239                         vdropl_recycle(vp);
 1240                         vn_finished_write(mp);
 1241                         goto next_iter_unlocked;
 1242                 }
 1243                 counter_u64_add(recycles_count, 1);
 1244                 vgonel(vp);
 1245                 VOP_UNLOCK(vp);
 1246                 vdropl_recycle(vp);
 1247                 vn_finished_write(mp);
 1248                 done++;
 1249 next_iter_unlocked:
 1250                 if (should_yield())
 1251                         kern_yield(PRI_USER);
 1252                 mtx_lock(&vnode_list_mtx);
 1253                 goto restart;
 1254 next_iter:
 1255                 MPASS(vp->v_type != VMARKER);
 1256                 if (!should_yield())
 1257                         continue;
 1258                 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 1259                 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
 1260                 mtx_unlock(&vnode_list_mtx);
 1261                 kern_yield(PRI_USER);
 1262                 mtx_lock(&vnode_list_mtx);
 1263                 goto restart;
 1264         }
 1265         if (done == 0 && !retried) {
 1266                 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 1267                 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
 1268                 retried = true;
 1269                 goto restart;
 1270         }
 1271         return (done);
 1272 }
 1273 
 1274 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
 1275 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
 1276     0,
 1277     "limit on vnode free requests per call to the vnlru_free routine");
 1278 
 1279 /*
 1280  * Attempt to reduce the free list by the requested amount.
 1281  */
 1282 static int
 1283 vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp)
 1284 {
 1285         struct vnode *vp;
 1286         struct mount *mp;
 1287         int ocount;
 1288 
 1289         mtx_assert(&vnode_list_mtx, MA_OWNED);
 1290         if (count > max_vnlru_free)
 1291                 count = max_vnlru_free;
 1292         ocount = count;
 1293         vp = mvp;
 1294         for (;;) {
 1295                 if (count == 0) {
 1296                         break;
 1297                 }
 1298                 vp = TAILQ_NEXT(vp, v_vnodelist);
 1299                 if (__predict_false(vp == NULL)) {
 1300                         TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 1301                         TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
 1302                         break;
 1303                 }
 1304                 if (__predict_false(vp->v_type == VMARKER))
 1305                         continue;
 1306                 if (vp->v_holdcnt > 0)
 1307                         continue;
 1308                 /*
  1309                  * Don't recycle if our vnode is from a different type
  1310                  * of mount point.  Note that mp is type-stable, so the
  1311                  * check does not reach an unmapped address even if the
  1312                  * vnode is reclaimed.
 1313                  */
 1314                 if (mnt_op != NULL && (mp = vp->v_mount) != NULL &&
 1315                     mp->mnt_op != mnt_op) {
 1316                         continue;
 1317                 }
 1318                 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
 1319                         continue;
 1320                 }
 1321                 if (!vhold_recycle_free(vp))
 1322                         continue;
 1323                 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 1324                 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
 1325                 mtx_unlock(&vnode_list_mtx);
 1326                 /*
  1327                  * FIXME: this ignores the return value, meaning nothing may have
  1328                  * been recycled while we claim otherwise to the caller.
 1329                  *
 1330                  * Originally the value started being ignored in 2005 with
 1331                  * 114a1006a8204aa156e1f9ad6476cdff89cada7f .
 1332                  *
 1333                  * Respecting the value can run into significant stalls if most
 1334                  * vnodes belong to one file system and it has writes
  1335                  * suspended.  In the presence of many threads and millions of
  1336                  * vnodes, they keep contending on the vnode_list_mtx lock only
  1337                  * to find vnodes they can't recycle.
  1338                  *
  1339                  * The solution would be to pre-check if the vnode is likely to
  1340                  * be recyclable, but it needs to happen with the
  1341                  * vnode_list_mtx lock held. This runs into a problem where
  1342                  * VOP_GETWRITEMOUNT (currently needed to find out whether
  1343                  * writes are frozen) can take locks which LOR against it.
 1344                  *
 1345                  * Check nullfs for one example (null_getwritemount).
 1346                  */
 1347                 vtryrecycle(vp);
 1348                 count--;
 1349                 mtx_lock(&vnode_list_mtx);
 1350                 vp = mvp;
 1351         }
 1352         return (ocount - count);
 1353 }
 1354 
 1355 static int
 1356 vnlru_free_locked(int count)
 1357 {
 1358 
 1359         mtx_assert(&vnode_list_mtx, MA_OWNED);
 1360         return (vnlru_free_impl(count, NULL, vnode_list_free_marker));
 1361 }
 1362 
 1363 void
 1364 vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp)
 1365 {
 1366 
 1367         MPASS(mnt_op != NULL);
 1368         MPASS(mvp != NULL);
 1369         VNPASS(mvp->v_type == VMARKER, mvp);
 1370         mtx_lock(&vnode_list_mtx);
 1371         vnlru_free_impl(count, mnt_op, mvp);
 1372         mtx_unlock(&vnode_list_mtx);
 1373 }
 1374 
 1375 struct vnode *
 1376 vnlru_alloc_marker(void)
 1377 {
 1378         struct vnode *mvp;
 1379 
 1380         mvp = vn_alloc_marker(NULL);
 1381         mtx_lock(&vnode_list_mtx);
 1382         TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist);
 1383         mtx_unlock(&vnode_list_mtx);
 1384         return (mvp);
 1385 }
 1386 
 1387 void
 1388 vnlru_free_marker(struct vnode *mvp)
 1389 {
 1390         mtx_lock(&vnode_list_mtx);
 1391         TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 1392         mtx_unlock(&vnode_list_mtx);
 1393         vn_free_marker(mvp);
 1394 }
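
/*
 * Illustrative sketch, not part of vfs_subr.c: how a filesystem might drive
 * the three routines above from a low-memory handler to recycle free vnodes
 * belonging only to its own mounts.  The names myfs_vfsops, myfs_lowmem_*
 * and the marker variable are hypothetical.
 */
static struct vnode *myfs_vnlru_marker;

static void
myfs_lowmem_init(void)
{

        /* Allocate a private list marker once, e.g. at module load time. */
        myfs_vnlru_marker = vnlru_alloc_marker();
}

static void
myfs_lowmem(int want)
{

        /* Only free vnodes whose mount uses myfs_vfsops are considered. */
        vnlru_free_vfsops(want, &myfs_vfsops, myfs_vnlru_marker);
}

static void
myfs_lowmem_fini(void)
{

        vnlru_free_marker(myfs_vnlru_marker);
        myfs_vnlru_marker = NULL;
}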
 1395 
 1396 static void
 1397 vnlru_recalc(void)
 1398 {
 1399 
 1400         mtx_assert(&vnode_list_mtx, MA_OWNED);
 1401         gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
 1402         vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
 1403         vlowat = vhiwat / 2;
 1404 }
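
/*
 * A worked example of the watermark arithmetic above, using assumed tunable
 * values: with desiredvnodes = 100000 and wantfreevnodes = 25000,
 * gapvnodes = imax(100000 - 25000, 100) = 75000, vhiwat = 75000 / 11 = 6818
 * and vlowat = 6818 / 2 = 3409.  vnlru_under() compares the remaining
 * headroom against these limits to decide when recycling is needed.
 */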
 1405 
 1406 /*
 1407  * Attempt to recycle vnodes in a context that is always safe to block.
 1408  * Calling vlrureclaim() from the bowels of filesystem code has some
 1409  * interesting deadlock problems.
 1410  */
 1411 static struct proc *vnlruproc;
 1412 static int vnlruproc_sig;
 1413 
 1414 /*
 1415  * The main freevnodes counter is only updated when threads requeue their vnode
 1416  * batches. CPUs are conditionally walked to compute a more accurate total.
 1417  *
 1418  * Limit how much slop we are willing to tolerate.  Note: the actual value at
 1419  * any given moment can still exceed the slop, but it should not do so by a
 1420  * significant margin in practice.
 1421  */
 1422 #define VNLRU_FREEVNODES_SLOP 128
 1423 
 1424 static __inline void
 1425 vfs_freevnodes_inc(void)
 1426 {
 1427         struct vdbatch *vd;
 1428 
 1429         critical_enter();
 1430         vd = DPCPU_PTR(vd);
 1431         vd->freevnodes++;
 1432         critical_exit();
 1433 }
 1434 
 1435 static __inline void
 1436 vfs_freevnodes_dec(void)
 1437 {
 1438         struct vdbatch *vd;
 1439 
 1440         critical_enter();
 1441         vd = DPCPU_PTR(vd);
 1442         vd->freevnodes--;
 1443         critical_exit();
 1444 }
 1445 
 1446 static u_long
 1447 vnlru_read_freevnodes(void)
 1448 {
 1449         struct vdbatch *vd;
 1450         long slop;
 1451         int cpu;
 1452 
 1453         mtx_assert(&vnode_list_mtx, MA_OWNED);
 1454         if (freevnodes > freevnodes_old)
 1455                 slop = freevnodes - freevnodes_old;
 1456         else
 1457                 slop = freevnodes_old - freevnodes;
 1458         if (slop < VNLRU_FREEVNODES_SLOP)
 1459                 return (freevnodes >= 0 ? freevnodes : 0);
 1460         freevnodes_old = freevnodes;
 1461         CPU_FOREACH(cpu) {
 1462                 vd = DPCPU_ID_PTR((cpu), vd);
 1463                 freevnodes_old += vd->freevnodes;
 1464         }
 1465         return (freevnodes_old >= 0 ? freevnodes_old : 0);
 1466 }
 1467 
 1468 static bool
 1469 vnlru_under(u_long rnumvnodes, u_long limit)
 1470 {
 1471         u_long rfreevnodes, space;
 1472 
 1473         if (__predict_false(rnumvnodes > desiredvnodes))
 1474                 return (true);
 1475 
 1476         space = desiredvnodes - rnumvnodes;
 1477         if (space < limit) {
 1478                 rfreevnodes = vnlru_read_freevnodes();
 1479                 if (rfreevnodes > wantfreevnodes)
 1480                         space += rfreevnodes - wantfreevnodes;
 1481         }
 1482         return (space < limit);
 1483 }
 1484 
 1485 static bool
 1486 vnlru_under_unlocked(u_long rnumvnodes, u_long limit)
 1487 {
 1488         long rfreevnodes, space;
 1489 
 1490         if (__predict_false(rnumvnodes > desiredvnodes))
 1491                 return (true);
 1492 
 1493         space = desiredvnodes - rnumvnodes;
 1494         if (space < limit) {
 1495                 rfreevnodes = atomic_load_long(&freevnodes);
 1496                 if (rfreevnodes > wantfreevnodes)
 1497                         space += rfreevnodes - wantfreevnodes;
 1498         }
 1499         return (space < limit);
 1500 }
 1501 
 1502 static void
 1503 vnlru_kick(void)
 1504 {
 1505 
 1506         mtx_assert(&vnode_list_mtx, MA_OWNED);
 1507         if (vnlruproc_sig == 0) {
 1508                 vnlruproc_sig = 1;
 1509                 wakeup(vnlruproc);
 1510         }
 1511 }
 1512 
 1513 static void
 1514 vnlru_proc(void)
 1515 {
 1516         u_long rnumvnodes, rfreevnodes, target;
 1517         unsigned long onumvnodes;
 1518         int done, force, trigger, usevnodes;
 1519         bool reclaim_nc_src, want_reread;
 1520 
 1521         EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
 1522             SHUTDOWN_PRI_FIRST);
 1523 
 1524         force = 0;
 1525         want_reread = false;
 1526         for (;;) {
 1527                 kproc_suspend_check(vnlruproc);
 1528                 mtx_lock(&vnode_list_mtx);
 1529                 rnumvnodes = atomic_load_long(&numvnodes);
 1530 
 1531                 if (want_reread) {
 1532                         force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
 1533                         want_reread = false;
 1534                 }
 1535 
 1536                 /*
 1537                  * If numvnodes is too large (due to desiredvnodes being
 1538                  * adjusted using its sysctl, or emergency growth), first
 1539                  * try to reduce it by discarding from the free list.
 1540                  */
 1541                 if (rnumvnodes > desiredvnodes) {
 1542                         vnlru_free_locked(rnumvnodes - desiredvnodes);
 1543                         rnumvnodes = atomic_load_long(&numvnodes);
 1544                 }
 1545                 /*
 1546                  * Sleep if the vnode cache is in a good state.  This is
 1547                  * when it is not over-full and has space for about a 4%
 1548                  * or 9% expansion (by growing its size or by not
 1549                  * excessively reducing its free list).  Otherwise, try
 1550                  * to reclaim space for a 10% expansion.
 1551                  */
 1552                 if (vstir && force == 0) {
 1553                         force = 1;
 1554                         vstir = 0;
 1555                 }
 1556                 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) {
 1557                         vnlruproc_sig = 0;
 1558                         wakeup(&vnlruproc_sig);
 1559                         msleep(vnlruproc, &vnode_list_mtx,
 1560                             PVFS|PDROP, "vlruwt", hz);
 1561                         continue;
 1562                 }
 1563                 rfreevnodes = vnlru_read_freevnodes();
 1564 
 1565                 onumvnodes = rnumvnodes;
 1566                 /*
 1567                  * Calculate parameters for recycling.  These are the same
 1568                  * throughout the loop to give some semblance of fairness.
 1569                  * The trigger point is to avoid recycling vnodes with lots
 1570                  * of resident pages.  We aren't trying to free memory; we
 1571                  * are trying to recycle or at least free vnodes.
 1572                  */
 1573                 if (rnumvnodes <= desiredvnodes)
 1574                         usevnodes = rnumvnodes - rfreevnodes;
 1575                 else
 1576                         usevnodes = rnumvnodes;
 1577                 if (usevnodes <= 0)
 1578                         usevnodes = 1;
 1579                 /*
 1580                  * The trigger value is chosen to be conservatively large
 1581                  * to ensure that it alone doesn't prevent making
 1582                  * progress.  The value can easily be so large that
 1583                  * it is effectively infinite in some congested and
 1584                  * misconfigured cases, and this is necessary.  Normally
 1585                  * it is about 8 to 100 (pages), which is quite large.
 1586                  */
 1587                 trigger = vm_cnt.v_page_count * 2 / usevnodes;
 1588                 if (force < 2)
 1589                         trigger = vsmalltrigger;
 1590                 reclaim_nc_src = force >= 3;
 1591                 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1);
 1592                 target = target / 10 + 1;
 1593                 done = vlrureclaim(reclaim_nc_src, trigger, target);
 1594                 mtx_unlock(&vnode_list_mtx);
 1595                 if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
 1596                         uma_reclaim(UMA_RECLAIM_DRAIN);
 1597                 if (done == 0) {
 1598                         if (force == 0 || force == 1) {
 1599                                 force = 2;
 1600                                 continue;
 1601                         }
 1602                         if (force == 2) {
 1603                                 force = 3;
 1604                                 continue;
 1605                         }
 1606                         want_reread = true;
 1607                         force = 0;
 1608                         vnlru_nowhere++;
 1609                         tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 1610                 } else {
 1611                         want_reread = true;
 1612                         kern_yield(PRI_USER);
 1613                 }
 1614         }
 1615 }
 1616 
 1617 static struct kproc_desc vnlru_kp = {
 1618         "vnlru",
 1619         vnlru_proc,
 1620         &vnlruproc
 1621 };
 1622 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
 1623     &vnlru_kp);
 1624 
 1625 /*
 1626  * Routines having to do with the management of the vnode table.
 1627  */
 1628 
 1629 /*
 1630  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 1631  * before we actually vgone().  This function must be called with the vnode
 1632  * held to prevent the vnode from being returned to the free list midway
 1633  * through vgone().
 1634  */
 1635 static int
 1636 vtryrecycle(struct vnode *vp)
 1637 {
 1638         struct mount *vnmp;
 1639 
 1640         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 1641         VNASSERT(vp->v_holdcnt, vp,
 1642             ("vtryrecycle: Recycling vp %p without a reference.", vp));
 1643         /*
 1644          * This vnode may be found and locked via some other list;
 1645          * if so, we can't recycle it yet.
 1646          */
 1647         if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 1648                 CTR2(KTR_VFS,
 1649                     "%s: impossible to recycle, vp %p lock is already held",
 1650                     __func__, vp);
 1651                 vdrop_recycle(vp);
 1652                 return (EWOULDBLOCK);
 1653         }
 1654         /*
 1655          * Don't recycle if its filesystem is being suspended.
 1656          */
 1657         if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
 1658                 VOP_UNLOCK(vp);
 1659                 CTR2(KTR_VFS,
 1660                     "%s: impossible to recycle, cannot start the write for %p",
 1661                     __func__, vp);
 1662                 vdrop_recycle(vp);
 1663                 return (EBUSY);
 1664         }
 1665         /*
 1666          * If we got this far, we need to acquire the interlock and see if
 1667          * anyone picked up this vnode from another list.  If not, we will
 1668          * mark it with DOOMED via vgonel() so that anyone who does find it
 1669          * will skip over it.
 1670          */
 1671         VI_LOCK(vp);
 1672         if (vp->v_usecount) {
 1673                 VOP_UNLOCK(vp);
 1674                 vdropl_recycle(vp);
 1675                 vn_finished_write(vnmp);
 1676                 CTR2(KTR_VFS,
 1677                     "%s: impossible to recycle, %p is already referenced",
 1678                     __func__, vp);
 1679                 return (EBUSY);
 1680         }
 1681         if (!VN_IS_DOOMED(vp)) {
 1682                 counter_u64_add(recycles_free_count, 1);
 1683                 vgonel(vp);
 1684         }
 1685         VOP_UNLOCK(vp);
 1686         vdropl_recycle(vp);
 1687         vn_finished_write(vnmp);
 1688         return (0);
 1689 }
 1690 
 1691 /*
 1692  * Allocate a new vnode.
 1693  *
 1694  * The operation never returns an error. Returning an error was disabled
 1695  * in r145385 (dated 2005) with the following comment:
 1696  *
 1697  * XXX Not all VFS_VGET/ffs_vget callers check returns.
 1698  *
 1699  * Given the age of this commit (almost 15 years at the time of writing this
 1700  * comment), restoring the ability to fail requires a significant audit of
 1701  * all codepaths.
 1702  *
 1703  * The routine can try to free a vnode or stall for up to 1 second waiting for
 1704  * vnlru to clear things up, but ultimately always performs an M_WAITOK allocation.
 1705  */
 1706 static u_long vn_alloc_cyclecount;
 1707 
 1708 static struct vnode * __noinline
 1709 vn_alloc_hard(struct mount *mp)
 1710 {
 1711         u_long rnumvnodes, rfreevnodes;
 1712 
 1713         mtx_lock(&vnode_list_mtx);
 1714         rnumvnodes = atomic_load_long(&numvnodes);
 1715         if (rnumvnodes + 1 < desiredvnodes) {
 1716                 vn_alloc_cyclecount = 0;
 1717                 goto alloc;
 1718         }
 1719         rfreevnodes = vnlru_read_freevnodes();
 1720         if (vn_alloc_cyclecount++ >= rfreevnodes) {
 1721                 vn_alloc_cyclecount = 0;
 1722                 vstir = 1;
 1723         }
 1724         /*
 1725          * Grow the vnode cache if it will not be above its target max
 1726          * after growing.  Otherwise, if the free list is nonempty, try
 1727          * to reclaim 1 item from it before growing the cache (possibly
 1728          * above its target max if the reclamation failed or is delayed).
 1729          * Otherwise, wait for some space.  In all cases, schedule
 1730          * vnlru_proc() if we are getting short of space.  The watermarks
 1731          * should be chosen so that we never wait or even reclaim from
 1732          * the free list to below its target minimum.
 1733          */
 1734         if (vnlru_free_locked(1) > 0)
 1735                 goto alloc;
 1736         if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
 1737                 /*
 1738                  * Wait for space for a new vnode.
 1739                  */
 1740                 vnlru_kick();
 1741                 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
 1742                 if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
 1743                     vnlru_read_freevnodes() > 1)
 1744                         vnlru_free_locked(1);
 1745         }
 1746 alloc:
 1747         rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
 1748         if (vnlru_under(rnumvnodes, vlowat))
 1749                 vnlru_kick();
 1750         mtx_unlock(&vnode_list_mtx);
 1751         return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 1752 }
 1753 
 1754 static struct vnode *
 1755 vn_alloc(struct mount *mp)
 1756 {
 1757         u_long rnumvnodes;
 1758 
 1759         if (__predict_false(vn_alloc_cyclecount != 0))
 1760                 return (vn_alloc_hard(mp));
 1761         rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
 1762         if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) {
 1763                 atomic_subtract_long(&numvnodes, 1);
 1764                 return (vn_alloc_hard(mp));
 1765         }
 1766 
 1767         return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 1768 }
 1769 
 1770 static void
 1771 vn_free(struct vnode *vp)
 1772 {
 1773 
 1774         atomic_subtract_long(&numvnodes, 1);
 1775         uma_zfree_smr(vnode_zone, vp);
 1776 }
 1777 
 1778 /*
 1779  * Return the next vnode from the free list.
 1780  */
 1781 int
 1782 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
 1783     struct vnode **vpp)
 1784 {
 1785         struct vnode *vp;
 1786         struct thread *td;
 1787         struct lock_object *lo;
 1788 
 1789         CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
 1790 
 1791         KASSERT(vops->registered,
 1792             ("%s: not registered vector op %p\n", __func__, vops));
 1793 
 1794         td = curthread;
 1795         if (td->td_vp_reserved != NULL) {
 1796                 vp = td->td_vp_reserved;
 1797                 td->td_vp_reserved = NULL;
 1798         } else {
 1799                 vp = vn_alloc(mp);
 1800         }
 1801         counter_u64_add(vnodes_created, 1);
 1802 
 1803         vn_set_state(vp, VSTATE_UNINITIALIZED);
 1804 
 1805         /*
 1806          * Locks are given the generic name "vnode" when created.
 1807          * Follow the historic practice of using the filesystem
 1808          * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
 1809          *
 1810          * Locks live in a witness group keyed on their name. Thus,
 1811          * when a lock is renamed, it must also move from the witness
 1812          * group of its old name to the witness group of its new name.
 1813          *
 1814          * The change only needs to be made when the vnode moves
 1815          * from one filesystem type to another. We ensure that each
 1816          * filesystem uses a single static name pointer for its tag so
 1817          * that we can compare pointers rather than doing a strcmp().
 1818          */
 1819         lo = &vp->v_vnlock->lock_object;
 1820 #ifdef WITNESS
 1821         if (lo->lo_name != tag) {
 1822 #endif
 1823                 lo->lo_name = tag;
 1824 #ifdef WITNESS
 1825                 WITNESS_DESTROY(lo);
 1826                 WITNESS_INIT(lo, tag);
 1827         }
 1828 #endif
 1829         /*
 1830          * By default, don't allow shared locks unless filesystems opt-in.
 1831          */
 1832         vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
 1833         /*
 1834          * Finalize various vnode identity bits.
 1835          */
 1836         KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
 1837         KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
 1838         KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
 1839         vp->v_type = VNON;
 1840         vp->v_op = vops;
 1841         vp->v_irflag = 0;
 1842         v_init_counters(vp);
 1843         vn_seqc_init(vp);
 1844         vp->v_bufobj.bo_ops = &buf_ops_bio;
 1845 #ifdef DIAGNOSTIC
 1846         if (mp == NULL && vops != &dead_vnodeops)
 1847                 printf("NULL mp in getnewvnode(9), tag %s\n", tag);
 1848 #endif
 1849 #ifdef MAC
 1850         mac_vnode_init(vp);
 1851         if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
 1852                 mac_vnode_associate_singlelabel(mp, vp);
 1853 #endif
 1854         if (mp != NULL) {
 1855                 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
 1856         }
 1857 
 1858         /*
 1859          * For the filesystems which do not use vfs_hash_insert(),
 1860          * still initialize v_hash so that vfs_hash_index() stays useful.
 1861          * E.g., nullfs uses vfs_hash_index() on the lower vnode for
 1862          * its own hashing.
 1863          */
 1864         vp->v_hash = (uintptr_t)vp >> vnsz2log;
 1865 
 1866         *vpp = vp;
 1867         return (0);
 1868 }
 1869 
 1870 void
 1871 getnewvnode_reserve(void)
 1872 {
 1873         struct thread *td;
 1874 
 1875         td = curthread;
 1876         MPASS(td->td_vp_reserved == NULL);
 1877         td->td_vp_reserved = vn_alloc(NULL);
 1878 }
 1879 
 1880 void
 1881 getnewvnode_drop_reserve(void)
 1882 {
 1883         struct thread *td;
 1884 
 1885         td = curthread;
 1886         if (td->td_vp_reserved != NULL) {
 1887                 vn_free(td->td_vp_reserved);
 1888                 td->td_vp_reserved = NULL;
 1889         }
 1890 }
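
/*
 * Illustrative sketch, not an existing vfs_subr.c consumer: the reserve/drop
 * pair above lets a filesystem pre-allocate a vnode while it is still safe to
 * sleep, so that a later getnewvnode() call made under filesystem locks
 * consumes the per-thread reserve instead of blocking in vn_alloc().
 * myfs_create_locked() is hypothetical.
 */
static int
myfs_create(struct mount *mp, struct vnode **vpp)
{
        int error;

        getnewvnode_reserve();                  /* may sleep; no fs locks held */
        error = myfs_create_locked(mp, vpp);    /* calls getnewvnode() inside */
        getnewvnode_drop_reserve();             /* frees the reserve if unused */
        return (error);
}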
 1891 
 1892 static void __noinline
 1893 freevnode(struct vnode *vp)
 1894 {
 1895         struct bufobj *bo;
 1896 
 1897         /*
 1898          * The vnode has been marked for destruction, so free it.
 1899          *
 1900          * The vnode will be returned to the zone where it will
 1901          * normally remain until it is needed for another vnode. We
 1902          * need to clean up (or verify that the cleanup has already
 1903          * been done) any residual data left from its current use
 1904          * so as not to contaminate the freshly allocated vnode.
 1905          */
 1906         CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
 1907         /*
 1908          * Paired with vgone.
 1909          */
 1910         vn_seqc_write_end_free(vp);
 1911 
 1912         bo = &vp->v_bufobj;
 1913         VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
 1914         VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
 1915         VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
 1916         VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
 1917         VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
 1918         VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
 1919         VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
 1920             ("clean blk trie not empty"));
 1921         VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
 1922         VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
 1923             ("dirty blk trie not empty"));
 1924         VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
 1925             ("Dangling rangelock waiters"));
 1926         VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp,
 1927             ("Leaked inactivation"));
 1928         VI_UNLOCK(vp);
 1929         cache_assert_no_entries(vp);
 1930 
 1931 #ifdef MAC
 1932         mac_vnode_destroy(vp);
 1933 #endif
 1934         if (vp->v_pollinfo != NULL) {
 1935                 /*
 1936                  * Use LK_NOWAIT to shut up witness about the lock. We may get
 1937                  * here while having another vnode locked when trying to
 1938                  * satisfy a lookup and needing to recycle.
 1939                  */
 1940                 VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT);
 1941                 destroy_vpollinfo(vp->v_pollinfo);
 1942                 VOP_UNLOCK(vp);
 1943                 vp->v_pollinfo = NULL;
 1944         }
 1945         vp->v_mountedhere = NULL;
 1946         vp->v_unpcb = NULL;
 1947         vp->v_rdev = NULL;
 1948         vp->v_fifoinfo = NULL;
 1949         vp->v_iflag = 0;
 1950         vp->v_vflag = 0;
 1951         bo->bo_flag = 0;
 1952         vn_free(vp);
 1953 }
 1954 
 1955 /*
 1956  * Delete from old mount point vnode list, if on one.
 1957  */
 1958 static void
 1959 delmntque(struct vnode *vp)
 1960 {
 1961         struct mount *mp;
 1962 
 1963         VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
 1964 
 1965         mp = vp->v_mount;
 1966         MNT_ILOCK(mp);
 1967         VI_LOCK(vp);
 1968         vp->v_mount = NULL;
 1969         VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
 1970                 ("bad mount point vnode list size"));
 1971         TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 1972         mp->mnt_nvnodelistsize--;
 1973         MNT_REL(mp);
 1974         MNT_IUNLOCK(mp);
 1975         /*
 1976          * The caller expects the interlock to be still held.
 1977          */
 1978         ASSERT_VI_LOCKED(vp, __func__);
 1979 }
 1980 
 1981 static int
 1982 insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr)
 1983 {
 1984 
 1985         KASSERT(vp->v_mount == NULL,
 1986                 ("insmntque: vnode already on per mount vnode list"));
 1987         VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
 1988         if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) {
 1989                 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
 1990         } else {
 1991                 KASSERT(!dtr,
 1992                     ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup",
 1993                     __func__));
 1994         }
 1995 
 1996         /*
 1997          * We acquire the vnode interlock early to ensure that the
 1998          * vnode cannot be recycled by another process releasing a
 1999          * holdcnt on it before we get it on both the vnode list
 2000          * and the active vnode list. The mount mutex protects only
 2001          * manipulation of the vnode list and the vnode freelist
 2002          * mutex protects only manipulation of the active vnode list.
 2003          * Hence the need to hold the vnode interlock throughout.
 2004          */
 2005         MNT_ILOCK(mp);
 2006         VI_LOCK(vp);
 2007         if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 &&
 2008             ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
 2009             mp->mnt_nvnodelistsize == 0)) &&
 2010             (vp->v_vflag & VV_FORCEINSMQ) == 0) {
 2011                 VI_UNLOCK(vp);
 2012                 MNT_IUNLOCK(mp);
 2013                 if (dtr) {
 2014                         vp->v_data = NULL;
 2015                         vp->v_op = &dead_vnodeops;
 2016                         vgone(vp);
 2017                         vput(vp);
 2018                 }
 2019                 return (EBUSY);
 2020         }
 2021         vp->v_mount = mp;
 2022         MNT_REF(mp);
 2023         TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 2024         VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
 2025                 ("neg mount point vnode list size"));
 2026         mp->mnt_nvnodelistsize++;
 2027         VI_UNLOCK(vp);
 2028         MNT_IUNLOCK(mp);
 2029         return (0);
 2030 }
 2031 
 2032 /*
 2033  * Insert into list of vnodes for the new mount point, if available.
 2034  * insmntque() reclaims the vnode on insertion failure, insmntque1()
 2035  * leaves handling of the vnode to the caller.
 2036  */
 2037 int
 2038 insmntque(struct vnode *vp, struct mount *mp)
 2039 {
 2040         return (insmntque1_int(vp, mp, true));
 2041 }
 2042 
 2043 int
 2044 insmntque1(struct vnode *vp, struct mount *mp)
 2045 {
 2046         return (insmntque1_int(vp, mp, false));
 2047 }
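
/*
 * Illustrative sketch, not an existing vfs_subr.c consumer: the usual pairing
 * of getnewvnode() and insmntque() in a filesystem's node-allocation path.
 * struct myfs_node, myfs_vnodeops, myfs_node_alloc() and myfs_node_free() are
 * hypothetical.
 */
static int
myfs_new_vnode(struct mount *mp, struct vnode **vpp)
{
        struct myfs_node *np;
        struct vnode *vp;
        int error;

        error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
        if (error != 0)
                return (error);
        np = myfs_node_alloc();
        vp->v_data = np;
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        error = insmntque(vp, mp);
        if (error != 0) {
                /*
                 * insmntque() already reclaimed and released the vnode on
                 * failure, so only the private node is left to clean up.
                 */
                myfs_node_free(np);
                return (error);
        }
        *vpp = vp;
        return (0);
}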
 2048 
 2049 /*
 2050  * Flush out and invalidate all buffers associated with a bufobj
 2051  * Called with the underlying object locked.
 2052  */
 2053 int
 2054 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
 2055 {
 2056         int error;
 2057 
 2058         BO_LOCK(bo);
 2059         if (flags & V_SAVE) {
 2060                 error = bufobj_wwait(bo, slpflag, slptimeo);
 2061                 if (error) {
 2062                         BO_UNLOCK(bo);
 2063                         return (error);
 2064                 }
 2065                 if (bo->bo_dirty.bv_cnt > 0) {
 2066                         BO_UNLOCK(bo);
 2067                         do {
 2068                                 error = BO_SYNC(bo, MNT_WAIT);
 2069                         } while (error == ERELOOKUP);
 2070                         if (error != 0)
 2071                                 return (error);
 2072                         BO_LOCK(bo);
 2073                         if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) {
 2074                                 BO_UNLOCK(bo);
 2075                                 return (EBUSY);
 2076                         }
 2077                 }
 2078         }
 2079         /*
 2080          * If you alter this loop, please note that the interlock is dropped and
 2081          * reacquired in flushbuflist.  Special care is needed to ensure that
 2082          * no race conditions occur from this.
 2083          */
 2084         do {
 2085                 error = flushbuflist(&bo->bo_clean,
 2086                     flags, bo, slpflag, slptimeo);
 2087                 if (error == 0 && !(flags & V_CLEANONLY))
 2088                         error = flushbuflist(&bo->bo_dirty,
 2089                             flags, bo, slpflag, slptimeo);
 2090                 if (error != 0 && error != EAGAIN) {
 2091                         BO_UNLOCK(bo);
 2092                         return (error);
 2093                 }
 2094         } while (error != 0);
 2095 
 2096         /*
 2097          * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
 2098          * have write I/O in-progress but if there is a VM object then the
 2099          * VM object can also have read-I/O in-progress.
 2100          */
 2101         do {
 2102                 bufobj_wwait(bo, 0, 0);
 2103                 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) {
 2104                         BO_UNLOCK(bo);
 2105                         vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx");
 2106                         BO_LOCK(bo);
 2107                 }
 2108         } while (bo->bo_numoutput > 0);
 2109         BO_UNLOCK(bo);
 2110 
 2111         /*
 2112          * Destroy the copy in the VM cache, too.
 2113          */
 2114         if (bo->bo_object != NULL &&
 2115             (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
 2116                 VM_OBJECT_WLOCK(bo->bo_object);
 2117                 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
 2118                     OBJPR_CLEANONLY : 0);
 2119                 VM_OBJECT_WUNLOCK(bo->bo_object);
 2120         }
 2121 
 2122 #ifdef INVARIANTS
 2123         BO_LOCK(bo);
 2124         if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO |
 2125             V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 ||
 2126             bo->bo_clean.bv_cnt > 0))
 2127                 panic("vinvalbuf: flush failed");
 2128         if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
 2129             bo->bo_dirty.bv_cnt > 0)
 2130                 panic("vinvalbuf: flush dirty failed");
 2131         BO_UNLOCK(bo);
 2132 #endif
 2133         return (0);
 2134 }
 2135 
 2136 /*
 2137  * Flush out and invalidate all buffers associated with a vnode.
 2138  * Called with the underlying object locked.
 2139  */
 2140 int
 2141 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
 2142 {
 2143 
 2144         CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
 2145         ASSERT_VOP_LOCKED(vp, "vinvalbuf");
 2146         if (vp->v_object != NULL && vp->v_object->handle != vp)
 2147                 return (0);
 2148         return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
 2149 }
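
/*
 * Illustrative sketch, not an existing vfs_subr.c consumer: a typical
 * vinvalbuf() call when a filesystem needs to discard everything cached for a
 * locked vnode, writing dirty data back first.
 */
static int
myfs_flush_and_invalidate(struct vnode *vp)
{

        ASSERT_VOP_LOCKED(vp, __func__);
        /* V_SAVE: sync dirty buffers before invalidating them all. */
        return (vinvalbuf(vp, V_SAVE, 0, 0));
}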
 2150 
 2151 /*
 2152  * Flush out buffers on the specified list.
 2153  *
 2154  */
 2155 static int
 2156 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
 2157     int slptimeo)
 2158 {
 2159         struct buf *bp, *nbp;
 2160         int retval, error;
 2161         daddr_t lblkno;
 2162         b_xflags_t xflags;
 2163 
 2164         ASSERT_BO_WLOCKED(bo);
 2165 
 2166         retval = 0;
 2167         TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
 2168                 /*
 2169                  * If we are flushing both V_NORMAL and V_ALT buffers then
 2170                  * do not skip any buffers. If we are flushing only V_NORMAL
 2171                  * buffers then skip buffers marked as BX_ALTDATA. If we are
 2172                  * flushing only V_ALT buffers then skip buffers not marked
 2173                  * as BX_ALTDATA.
 2174                  */
 2175                 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) &&
 2176                    (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) ||
 2177                     ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) {
 2178                         continue;
 2179                 }
 2180                 if (nbp != NULL) {
 2181                         lblkno = nbp->b_lblkno;
 2182                         xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
 2183                 }
 2184                 retval = EAGAIN;
 2185                 error = BUF_TIMELOCK(bp,
 2186                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
 2187                     "flushbuf", slpflag, slptimeo);
 2188                 if (error) {
 2189                         BO_LOCK(bo);
 2190                         return (error != ENOLCK ? error : EAGAIN);
 2191                 }
 2192                 KASSERT(bp->b_bufobj == bo,
 2193                     ("bp %p wrong b_bufobj %p should be %p",
 2194                     bp, bp->b_bufobj, bo));
 2195                 /*
 2196                  * XXX Since there are no node locks for NFS, I
 2197                  * believe there is a slight chance that a delayed
 2198                  * write will occur while sleeping just above, so
 2199                  * check for it.
 2200                  */
 2201                 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
 2202                     (flags & V_SAVE)) {
 2203                         bremfree(bp);
 2204                         bp->b_flags |= B_ASYNC;
 2205                         bwrite(bp);
 2206                         BO_LOCK(bo);
 2207                         return (EAGAIN);        /* XXX: why not loop ? */
 2208                 }
 2209                 bremfree(bp);
 2210                 bp->b_flags |= (B_INVAL | B_RELBUF);
 2211                 bp->b_flags &= ~B_ASYNC;
 2212                 brelse(bp);
 2213                 BO_LOCK(bo);
 2214                 if (nbp == NULL)
 2215                         break;
 2216                 nbp = gbincore(bo, lblkno);
 2217                 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 2218                     != xflags)
 2219                         break;                  /* nbp invalid */
 2220         }
 2221         return (retval);
 2222 }
 2223 
 2224 int
 2225 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
 2226 {
 2227         struct buf *bp;
 2228         int error;
 2229         daddr_t lblkno;
 2230 
 2231         ASSERT_BO_LOCKED(bo);
 2232 
 2233         for (lblkno = startn;;) {
 2234 again:
 2235                 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
 2236                 if (bp == NULL || bp->b_lblkno >= endn ||
 2237                     bp->b_lblkno < startn)
 2238                         break;
 2239                 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
 2240                     LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
 2241                 if (error != 0) {
 2242                         BO_RLOCK(bo);
 2243                         if (error == ENOLCK)
 2244                                 goto again;
 2245                         return (error);
 2246                 }
 2247                 KASSERT(bp->b_bufobj == bo,
 2248                     ("bp %p wrong b_bufobj %p should be %p",
 2249                     bp, bp->b_bufobj, bo));
 2250                 lblkno = bp->b_lblkno + 1;
 2251                 if ((bp->b_flags & B_MANAGED) == 0)
 2252                         bremfree(bp);
 2253                 bp->b_flags |= B_RELBUF;
 2254                 /*
 2255                  * In the VMIO case, use the B_NOREUSE flag to hint that the
 2256                  * pages backing each buffer in the range are unlikely to be
 2257                  * reused.  Dirty buffers will have the hint applied once
 2258                  * they've been written.
 2259                  */
 2260                 if ((bp->b_flags & B_VMIO) != 0)
 2261                         bp->b_flags |= B_NOREUSE;
 2262                 brelse(bp);
 2263                 BO_RLOCK(bo);
 2264         }
 2265         return (0);
 2266 }
 2267 
 2268 /*
 2269  * Truncate a file's buffer and pages to a specified length.  This
 2270  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 2271  * sync activity.
 2272  */
 2273 int
 2274 vtruncbuf(struct vnode *vp, off_t length, int blksize)
 2275 {
 2276         struct buf *bp, *nbp;
 2277         struct bufobj *bo;
 2278         daddr_t startlbn;
 2279 
 2280         CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__,
 2281             vp, blksize, (uintmax_t)length);
 2282 
 2283         /*
 2284          * Round up to the *next* lbn.
 2285          */
 2286         startlbn = howmany(length, blksize);
 2287 
 2288         ASSERT_VOP_LOCKED(vp, "vtruncbuf");
 2289 
 2290         bo = &vp->v_bufobj;
 2291 restart_unlocked:
 2292         BO_LOCK(bo);
 2293 
 2294         while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN)
 2295                 ;
 2296 
 2297         if (length > 0) {
 2298 restartsync:
 2299                 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 2300                         if (bp->b_lblkno > 0)
 2301                                 continue;
 2302                         /*
 2303                          * Since we hold the vnode lock this should only
 2304                          * fail if we're racing with the buf daemon.
 2305                          */
 2306                         if (BUF_LOCK(bp,
 2307                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 2308                             BO_LOCKPTR(bo)) == ENOLCK)
 2309                                 goto restart_unlocked;
 2310 
 2311                         VNASSERT((bp->b_flags & B_DELWRI), vp,
 2312                             ("buf(%p) on dirty queue without DELWRI", bp));
 2313 
 2314                         bremfree(bp);
 2315                         bawrite(bp);
 2316                         BO_LOCK(bo);
 2317                         goto restartsync;
 2318                 }
 2319         }
 2320 
 2321         bufobj_wwait(bo, 0, 0);
 2322         BO_UNLOCK(bo);
 2323         vnode_pager_setsize(vp, length);
 2324 
 2325         return (0);
 2326 }
 2327 
 2328 /*
 2329  * Invalidate the cached pages of a file's buffer within the range of block
 2330  * numbers [startlbn, endlbn).
 2331  */
 2332 void
 2333 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
 2334     int blksize)
 2335 {
 2336         struct bufobj *bo;
 2337         off_t start, end;
 2338 
 2339         ASSERT_VOP_LOCKED(vp, "v_inval_buf_range");
 2340 
 2341         start = blksize * startlbn;
 2342         end = blksize * endlbn;
 2343 
 2344         bo = &vp->v_bufobj;
 2345         BO_LOCK(bo);
 2346         MPASS(blksize == bo->bo_bsize);
 2347 
 2348         while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN)
 2349                 ;
 2350 
 2351         BO_UNLOCK(bo);
 2352         vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1));
 2353 }
 2354 
 2355 static int
 2356 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
 2357     daddr_t startlbn, daddr_t endlbn)
 2358 {
 2359         struct buf *bp, *nbp;
 2360         bool anyfreed;
 2361 
 2362         ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked");
 2363         ASSERT_BO_LOCKED(bo);
 2364 
 2365         do {
 2366                 anyfreed = false;
 2367                 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
 2368                         if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
 2369                                 continue;
 2370                         if (BUF_LOCK(bp,
 2371                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 2372                             BO_LOCKPTR(bo)) == ENOLCK) {
 2373                                 BO_LOCK(bo);
 2374                                 return (EAGAIN);
 2375                         }
 2376 
 2377                         bremfree(bp);
 2378                         bp->b_flags |= B_INVAL | B_RELBUF;
 2379                         bp->b_flags &= ~B_ASYNC;
 2380                         brelse(bp);
 2381                         anyfreed = true;
 2382 
 2383                         BO_LOCK(bo);
 2384                         if (nbp != NULL &&
 2385                             (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
 2386                             nbp->b_vp != vp ||
 2387                             (nbp->b_flags & B_DELWRI) != 0))
 2388                                 return (EAGAIN);
 2389                 }
 2390 
 2391                 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 2392                         if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
 2393                                 continue;
 2394                         if (BUF_LOCK(bp,
 2395                             LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 2396                             BO_LOCKPTR(bo)) == ENOLCK) {
 2397                                 BO_LOCK(bo);
 2398                                 return (EAGAIN);
 2399                         }
 2400                         bremfree(bp);
 2401                         bp->b_flags |= B_INVAL | B_RELBUF;
 2402                         bp->b_flags &= ~B_ASYNC;
 2403                         brelse(bp);
 2404                         anyfreed = true;
 2405 
 2406                         BO_LOCK(bo);
 2407                         if (nbp != NULL &&
 2408                             (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
 2409                             (nbp->b_vp != vp) ||
 2410                             (nbp->b_flags & B_DELWRI) == 0))
 2411                                 return (EAGAIN);
 2412                 }
 2413         } while (anyfreed);
 2414         return (0);
 2415 }
 2416 
 2417 static void
 2418 buf_vlist_remove(struct buf *bp)
 2419 {
 2420         struct bufv *bv;
 2421         b_xflags_t flags;
 2422 
 2423         flags = bp->b_xflags;
 2424 
 2425         KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 2426         ASSERT_BO_WLOCKED(bp->b_bufobj);
 2427         KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 &&
 2428             (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN),
 2429             ("%s: buffer %p has invalid queue state", __func__, bp));
 2430 
 2431         if ((flags & BX_VNDIRTY) != 0)
 2432                 bv = &bp->b_bufobj->bo_dirty;
 2433         else
 2434                 bv = &bp->b_bufobj->bo_clean;
 2435         BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
 2436         TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
 2437         bv->bv_cnt--;
 2438         bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 2439 }
 2440 
 2441 /*
 2442  * Add the buffer to the sorted clean or dirty block list.
 2443  *
 2444  * NOTE: xflags is passed as a constant, optimizing this inline function!
 2445  */
 2446 static void
 2447 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
 2448 {
 2449         struct bufv *bv;
 2450         struct buf *n;
 2451         int error;
 2452 
 2453         ASSERT_BO_WLOCKED(bo);
 2454         KASSERT((bo->bo_flag & BO_NOBUFS) == 0,
 2455             ("buf_vlist_add: bo %p does not allow bufs", bo));
 2456         KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
 2457             ("dead bo %p", bo));
 2458         KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
 2459             ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
 2460         bp->b_xflags |= xflags;
 2461         if (xflags & BX_VNDIRTY)
 2462                 bv = &bo->bo_dirty;
 2463         else
 2464                 bv = &bo->bo_clean;
 2465 
 2466         /*
 2467          * Keep the list ordered.  Optimize empty list insertion.  Assume
 2468          * we tend to grow at the tail so lookup_le should usually be cheaper
 2469          * than _ge. 
 2470          */
 2471         if (bv->bv_cnt == 0 ||
 2472             bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
 2473                 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
 2474         else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
 2475                 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
 2476         else
 2477                 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
 2478         error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
 2479         if (error)
 2480                 panic("buf_vlist_add:  Preallocated nodes insufficient.");
 2481         bv->bv_cnt++;
 2482 }
 2483 
 2484 /*
 2485  * Look up a buffer using the buffer tries.
 2486  */
 2487 struct buf *
 2488 gbincore(struct bufobj *bo, daddr_t lblkno)
 2489 {
 2490         struct buf *bp;
 2491 
 2492         ASSERT_BO_LOCKED(bo);
 2493         bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
 2494         if (bp != NULL)
 2495                 return (bp);
 2496         return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno));
 2497 }
 2498 
 2499 /*
 2500  * Look up a buf using the buffer tries, without the bufobj lock.  This relies
 2501  * on SMR for safe lookup, and bufs being in a no-free zone to provide type
 2502  * stability of the result.  Like other lockless lookups, the found buf may
 2503  * already be invalid by the time this function returns.
 2504  */
 2505 struct buf *
 2506 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno)
 2507 {
 2508         struct buf *bp;
 2509 
 2510         ASSERT_BO_UNLOCKED(bo);
 2511         bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno);
 2512         if (bp != NULL)
 2513                 return (bp);
 2514         return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno));
 2515 }
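
/*
 * Illustrative sketch, not an existing vfs_subr.c consumer: one way a caller
 * might use the lockless lookup above.  Because the result can go stale, the
 * buffer is locked and its identity re-checked before it is trusted.
 * myfs_find_buf() is hypothetical.
 */
static struct buf *
myfs_find_buf(struct bufobj *bo, daddr_t lblkno)
{
        struct buf *bp;

        bp = gbincore_unlocked(bo, lblkno);
        if (bp == NULL)
                return (NULL);
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
                return (NULL);
        /* The buffer may have been recycled while unlocked; re-verify it. */
        if (bp->b_bufobj != bo || bp->b_lblkno != lblkno) {
                BUF_UNLOCK(bp);
                return (NULL);
        }
        return (bp);
}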
 2516 
 2517 /*
 2518  * Associate a buffer with a vnode.
 2519  */
 2520 void
 2521 bgetvp(struct vnode *vp, struct buf *bp)
 2522 {
 2523         struct bufobj *bo;
 2524 
 2525         bo = &vp->v_bufobj;
 2526         ASSERT_BO_WLOCKED(bo);
 2527         VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
 2528 
 2529         CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
 2530         VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
 2531             ("bgetvp: bp already attached! %p", bp));
 2532 
 2533         vhold(vp);
 2534         bp->b_vp = vp;
 2535         bp->b_bufobj = bo;
 2536         /*
 2537          * Insert onto list for new vnode.
 2538          */
 2539         buf_vlist_add(bp, bo, BX_VNCLEAN);
 2540 }
 2541 
 2542 /*
 2543  * Disassociate a buffer from a vnode.
 2544  */
 2545 void
 2546 brelvp(struct buf *bp)
 2547 {
 2548         struct bufobj *bo;
 2549         struct vnode *vp;
 2550 
 2551         CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 2552         KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 2553 
 2554         /*
 2555          * Delete from old vnode list, if on one.
 2556          */
 2557         vp = bp->b_vp;          /* XXX */
 2558         bo = bp->b_bufobj;
 2559         BO_LOCK(bo);
 2560         buf_vlist_remove(bp);
 2561         if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 2562                 bo->bo_flag &= ~BO_ONWORKLST;
 2563                 mtx_lock(&sync_mtx);
 2564                 LIST_REMOVE(bo, bo_synclist);
 2565                 syncer_worklist_len--;
 2566                 mtx_unlock(&sync_mtx);
 2567         }
 2568         bp->b_vp = NULL;
 2569         bp->b_bufobj = NULL;
 2570         BO_UNLOCK(bo);
 2571         vdrop(vp);
 2572 }
 2573 
 2574 /*
 2575  * Add an item to the syncer work queue.
 2576  */
 2577 static void
 2578 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
 2579 {
 2580         int slot;
 2581 
 2582         ASSERT_BO_WLOCKED(bo);
 2583 
 2584         mtx_lock(&sync_mtx);
 2585         if (bo->bo_flag & BO_ONWORKLST)
 2586                 LIST_REMOVE(bo, bo_synclist);
 2587         else {
 2588                 bo->bo_flag |= BO_ONWORKLST;
 2589                 syncer_worklist_len++;
 2590         }
 2591 
 2592         if (delay > syncer_maxdelay - 2)
 2593                 delay = syncer_maxdelay - 2;
 2594         slot = (syncer_delayno + delay) & syncer_mask;
 2595 
 2596         LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
 2597         mtx_unlock(&sync_mtx);
 2598 }
 2599 
 2600 static int
 2601 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
 2602 {
 2603         int error, len;
 2604 
 2605         mtx_lock(&sync_mtx);
 2606         len = syncer_worklist_len - sync_vnode_count;
 2607         mtx_unlock(&sync_mtx);
 2608         error = SYSCTL_OUT(req, &len, sizeof(len));
 2609         return (error);
 2610 }
 2611 
 2612 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len,
 2613     CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0,
 2614     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
 2615 
 2616 static struct proc *updateproc;
 2617 static void sched_sync(void);
 2618 static struct kproc_desc up_kp = {
 2619         "syncer",
 2620         sched_sync,
 2621         &updateproc
 2622 };
 2623 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
 2624 
 2625 static int
 2626 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
 2627 {
 2628         struct vnode *vp;
 2629         struct mount *mp;
 2630 
 2631         *bo = LIST_FIRST(slp);
 2632         if (*bo == NULL)
 2633                 return (0);
 2634         vp = bo2vnode(*bo);
 2635         if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
 2636                 return (1);
 2637         /*
 2638          * We use vhold in case the vnode does not
 2639          * successfully sync.  vhold prevents the vnode from
 2640          * going away when we unlock the sync_mtx so that
 2641          * we can acquire the vnode interlock.
 2642          */
 2643         vholdl(vp);
 2644         mtx_unlock(&sync_mtx);
 2645         VI_UNLOCK(vp);
 2646         if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 2647                 vdrop(vp);
 2648                 mtx_lock(&sync_mtx);
 2649                 return (*bo == LIST_FIRST(slp));
 2650         }
 2651         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 2652         (void) VOP_FSYNC(vp, MNT_LAZY, td);
 2653         VOP_UNLOCK(vp);
 2654         vn_finished_write(mp);
 2655         BO_LOCK(*bo);
 2656         if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
 2657                 /*
 2658                  * Put us back on the worklist.  The worklist
 2659                  * routine will remove us from our current
 2660                  * position and then add us back in at a later
 2661                  * position.
 2662                  */
 2663                 vn_syncer_add_to_worklist(*bo, syncdelay);
 2664         }
 2665         BO_UNLOCK(*bo);
 2666         vdrop(vp);
 2667         mtx_lock(&sync_mtx);
 2668         return (0);
 2669 }
 2670 
 2671 static int first_printf = 1;
 2672 
 2673 /*
 2674  * System filesystem synchronizer daemon.
 2675  */
 2676 static void
 2677 sched_sync(void)
 2678 {
 2679         struct synclist *next, *slp;
 2680         struct bufobj *bo;
 2681         long starttime;
 2682         struct thread *td = curthread;
 2683         int last_work_seen;
 2684         int net_worklist_len;
 2685         int syncer_final_iter;
 2686         int error;
 2687 
 2688         last_work_seen = 0;
 2689         syncer_final_iter = 0;
 2690         syncer_state = SYNCER_RUNNING;
 2691         starttime = time_uptime;
 2692         td->td_pflags |= TDP_NORUNNINGBUF;
 2693 
 2694         EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
 2695             SHUTDOWN_PRI_LAST);
 2696 
 2697         mtx_lock(&sync_mtx);
 2698         for (;;) {
 2699                 if (syncer_state == SYNCER_FINAL_DELAY &&
 2700                     syncer_final_iter == 0) {
 2701                         mtx_unlock(&sync_mtx);
 2702                         kproc_suspend_check(td->td_proc);
 2703                         mtx_lock(&sync_mtx);
 2704                 }
 2705                 net_worklist_len = syncer_worklist_len - sync_vnode_count;
 2706                 if (syncer_state != SYNCER_RUNNING &&
 2707                     starttime != time_uptime) {
 2708                         if (first_printf) {
 2709                                 printf("\nSyncing disks, vnodes remaining... ");
 2710                                 first_printf = 0;
 2711                         }
 2712                         printf("%d ", net_worklist_len);
 2713                 }
 2714                 starttime = time_uptime;
 2715 
 2716                 /*
 2717                  * Push files whose dirty time has expired.  Be careful
 2718                  * of interrupt race on slp queue.
 2719                  *
 2720                  * Skip over empty worklist slots when shutting down.
 2721                  */
 2722                 do {
 2723                         slp = &syncer_workitem_pending[syncer_delayno];
 2724                         syncer_delayno += 1;
 2725                         if (syncer_delayno == syncer_maxdelay)
 2726                                 syncer_delayno = 0;
 2727                         next = &syncer_workitem_pending[syncer_delayno];
 2728                         /*
 2729                          * If the worklist has wrapped since the last
 2730                          * time it was emptied of all but syncer vnodes,
 2731                          * switch to the FINAL_DELAY state and run
 2732                          * for one more second.
 2733                          */
 2734                         if (syncer_state == SYNCER_SHUTTING_DOWN &&
 2735                             net_worklist_len == 0 &&
 2736                             last_work_seen == syncer_delayno) {
 2737                                 syncer_state = SYNCER_FINAL_DELAY;
 2738                                 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
 2739                         }
 2740                 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
 2741                     syncer_worklist_len > 0);
 2742 
 2743                 /*
 2744                  * Keep track of the last time there was anything
 2745                  * on the worklist other than syncer vnodes.
 2746                  * Return to the SHUTTING_DOWN state if any
 2747                  * new work appears.
 2748                  */
 2749                 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
 2750                         last_work_seen = syncer_delayno;
 2751                 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
 2752                         syncer_state = SYNCER_SHUTTING_DOWN;
 2753                 while (!LIST_EMPTY(slp)) {
 2754                         error = sync_vnode(slp, &bo, td);
 2755                         if (error == 1) {
 2756                                 LIST_REMOVE(bo, bo_synclist);
 2757                                 LIST_INSERT_HEAD(next, bo, bo_synclist);
 2758                                 continue;
 2759                         }
 2760 
 2761                         if (first_printf == 0) {
 2762                                 /*
 2763                                  * Drop the sync mutex, because some watchdog
 2764                                  * drivers need to sleep while patting the watchdog.
 2765                                  */
 2766                                 mtx_unlock(&sync_mtx);
 2767                                 wdog_kern_pat(WD_LASTVAL);
 2768                                 mtx_lock(&sync_mtx);
 2769                         }
 2770                 }
 2771                 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
 2772                         syncer_final_iter--;
 2773                 /*
 2774                  * The variable rushjob allows the kernel to speed up the
 2775                  * processing of the filesystem syncer process. A rushjob
 2776                  * value of N tells the filesystem syncer to process the next
 2777                  * N seconds worth of work on its queue ASAP. Currently rushjob
 2778                  * is used by the soft update code to speed up the filesystem
 2779                  * syncer process when the incore state is getting so far
 2780                  * ahead of the disk that the kernel memory pool is being
 2781                  * threatened with exhaustion.
 2782                  */
 2783                 if (rushjob > 0) {
 2784                         rushjob -= 1;
 2785                         continue;
 2786                 }
 2787                 /*
 2788                  * Just sleep for a short period of time between
 2789                  * iterations when shutting down to allow some I/O
 2790                  * to happen.
 2791                  *
 2792                  * If it has taken us less than a second to process the
 2793                  * current work, then wait. Otherwise start right over
 2794                  * again. We can still lose time if any single round
 2795                  * takes more than two seconds, but it does not really
 2796                  * matter as we are just trying to generally pace the
 2797                  * filesystem activity.
 2798                  */
 2799                 if (syncer_state != SYNCER_RUNNING ||
 2800                     time_uptime == starttime) {
 2801                         thread_lock(td);
 2802                         sched_prio(td, PPAUSE);
 2803                         thread_unlock(td);
 2804                 }
 2805                 if (syncer_state != SYNCER_RUNNING)
 2806                         cv_timedwait(&sync_wakeup, &sync_mtx,
 2807                             hz / SYNCER_SHUTDOWN_SPEEDUP);
 2808                 else if (time_uptime == starttime)
 2809                         cv_timedwait(&sync_wakeup, &sync_mtx, hz);
 2810         }
 2811 }
 2812 
 2813 /*
 2814  * Request the syncer daemon to speed up its work.
 2815  * We never push it to speed up more than half of its
 2816  * normal turn time, otherwise it could take over the cpu.
 2817  */
 2818 int
 2819 speedup_syncer(void)
 2820 {
 2821         int ret = 0;
 2822 
 2823         mtx_lock(&sync_mtx);
 2824         if (rushjob < syncdelay / 2) {
 2825                 rushjob += 1;
 2826                 stat_rush_requests += 1;
 2827                 ret = 1;
 2828         }
 2829         mtx_unlock(&sync_mtx);
 2830         cv_broadcast(&sync_wakeup);
 2831         return (ret);
 2832 }
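
      /*
       * Illustrative sketch (editor's addition, not part of the original
       * file): a subsystem that notices in-core dirty state outpacing the
       * disk can nudge the syncer.  The bump is capped at half of the normal
       * turn time, per the comment above, so the syncer cannot monopolize
       * the CPU.  The predicate below is hypothetical; in the tree the soft
       * update code is the consumer of this interface.
       *
       *        if (dirty_state_piling_up())    // hypothetical check
       *                (void)speedup_syncer(); // returns 1 if rushjob grew
       */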
 2833 
 2834 /*
 2835  * Tell the syncer to speed up its work and run through its work
 2836  * list several times, then tell it to shut down.
 2837  */
 2838 static void
 2839 syncer_shutdown(void *arg, int howto)
 2840 {
 2841 
 2842         if (howto & RB_NOSYNC)
 2843                 return;
 2844         mtx_lock(&sync_mtx);
 2845         syncer_state = SYNCER_SHUTTING_DOWN;
 2846         rushjob = 0;
 2847         mtx_unlock(&sync_mtx);
 2848         cv_broadcast(&sync_wakeup);
 2849         kproc_shutdown(arg, howto);
 2850 }
 2851 
 2852 void
 2853 syncer_suspend(void)
 2854 {
 2855 
 2856         syncer_shutdown(updateproc, 0);
 2857 }
 2858 
 2859 void
 2860 syncer_resume(void)
 2861 {
 2862 
 2863         mtx_lock(&sync_mtx);
 2864         first_printf = 1;
 2865         syncer_state = SYNCER_RUNNING;
 2866         mtx_unlock(&sync_mtx);
 2867         cv_broadcast(&sync_wakeup);
 2868         kproc_resume(updateproc);
 2869 }
 2870 
 2871 /*
 2872  * Move the buffer between the clean and dirty lists of its vnode.
 2873  */
 2874 void
 2875 reassignbuf(struct buf *bp)
 2876 {
 2877         struct vnode *vp;
 2878         struct bufobj *bo;
 2879         int delay;
 2880 #ifdef INVARIANTS
 2881         struct bufv *bv;
 2882 #endif
 2883 
 2884         vp = bp->b_vp;
 2885         bo = bp->b_bufobj;
 2886 
 2887         KASSERT((bp->b_flags & B_PAGING) == 0,
 2888             ("%s: cannot reassign paging buffer %p", __func__, bp));
 2889 
 2890         CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
 2891             bp, bp->b_vp, bp->b_flags);
 2892 
 2893         BO_LOCK(bo);
 2894         buf_vlist_remove(bp);
 2895 
 2896         /*
 2897          * If dirty, put on list of dirty buffers; otherwise insert onto list
 2898          * of clean buffers.
 2899          */
 2900         if (bp->b_flags & B_DELWRI) {
 2901                 if ((bo->bo_flag & BO_ONWORKLST) == 0) {
 2902                         switch (vp->v_type) {
 2903                         case VDIR:
 2904                                 delay = dirdelay;
 2905                                 break;
 2906                         case VCHR:
 2907                                 delay = metadelay;
 2908                                 break;
 2909                         default:
 2910                                 delay = filedelay;
 2911                         }
 2912                         vn_syncer_add_to_worklist(bo, delay);
 2913                 }
 2914                 buf_vlist_add(bp, bo, BX_VNDIRTY);
 2915         } else {
 2916                 buf_vlist_add(bp, bo, BX_VNCLEAN);
 2917 
 2918                 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 2919                         mtx_lock(&sync_mtx);
 2920                         LIST_REMOVE(bo, bo_synclist);
 2921                         syncer_worklist_len--;
 2922                         mtx_unlock(&sync_mtx);
 2923                         bo->bo_flag &= ~BO_ONWORKLST;
 2924                 }
 2925         }
 2926 #ifdef INVARIANTS
 2927         bv = &bo->bo_clean;
 2928         bp = TAILQ_FIRST(&bv->bv_hd);
 2929         KASSERT(bp == NULL || bp->b_bufobj == bo,
 2930             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 2931         bp = TAILQ_LAST(&bv->bv_hd, buflists);
 2932         KASSERT(bp == NULL || bp->b_bufobj == bo,
 2933             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 2934         bv = &bo->bo_dirty;
 2935         bp = TAILQ_FIRST(&bv->bv_hd);
 2936         KASSERT(bp == NULL || bp->b_bufobj == bo,
 2937             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 2938         bp = TAILQ_LAST(&bv->bv_hd, buflists);
 2939         KASSERT(bp == NULL || bp->b_bufobj == bo,
 2940             ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 2941 #endif
 2942         BO_UNLOCK(bo);
 2943 }
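
      /*
       * Illustrative sketch (editor's addition): the expected caller is the
       * buffer cache when a buffer changes its dirty state.  A delayed-write
       * path would, roughly:
       *
       *        bp->b_flags |= B_DELWRI;        // buffer now carries dirty data
       *        reassignbuf(bp);                // move it to bo_dirty and put the
       *                                        // vnode on the syncer worklist
       *
       * The exact flag handling lives in the buffer cache; this only shows
       * where reassignbuf() fits.
       */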
 2944 
 2945 static void
 2946 v_init_counters(struct vnode *vp)
 2947 {
 2948 
 2949         VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
 2950             vp, ("%s called for an initialized vnode", __FUNCTION__));
 2951         ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
 2952 
 2953         refcount_init(&vp->v_holdcnt, 1);
 2954         refcount_init(&vp->v_usecount, 1);
 2955 }
 2956 
 2957 /*
 2958  * Grab a particular vnode from the free list, increment its
 2959  * reference count and lock it.  VIRF_DOOMED is set if the vnode
 2960  * is being destroyed.  Only callers who specify LK_RETRY will
 2961  * see doomed vnodes.  If inactive processing was delayed in
 2962  * vput try to do it here.
 2963  *
 2964  * usecount is manipulated using atomics without holding any locks.
 2965  *
 2966  * holdcnt can be manipulated using atomics without holding any locks,
 2967  * except when transitioning 1<->0, in which case the interlock is held.
 2968  *
 2969  * Consumers which don't guarantee liveness of the vnode can use SMR to
 2970  * try to get a reference. Note this operation can fail since the vnode
 2971  * may already be awaiting freeing by the time they get to it.
 2972  */
 2973 enum vgetstate
 2974 vget_prep_smr(struct vnode *vp)
 2975 {
 2976         enum vgetstate vs;
 2977 
 2978         VFS_SMR_ASSERT_ENTERED();
 2979 
 2980         if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
 2981                 vs = VGET_USECOUNT;
 2982         } else {
 2983                 if (vhold_smr(vp))
 2984                         vs = VGET_HOLDCNT;
 2985                 else
 2986                         vs = VGET_NONE;
 2987         }
 2988         return (vs);
 2989 }
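
      /*
       * Illustrative sketch (editor's addition): a lockless lookup pairs the
       * above with vget_finish() once outside of the SMR section.  The lookup
       * itself is elided and vfs_smr_enter()/vfs_smr_exit() are assumed to
       * bracket it.
       *
       *        vfs_smr_enter();
       *        vp = <lockless lookup>;
       *        vs = vget_prep_smr(vp);
       *        vfs_smr_exit();
       *        if (vs == VGET_NONE)
       *                <restart the lookup>;
       *        error = vget_finish(vp, LK_SHARED, vs);
       */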
 2990 
 2991 enum vgetstate
 2992 vget_prep(struct vnode *vp)
 2993 {
 2994         enum vgetstate vs;
 2995 
 2996         if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
 2997                 vs = VGET_USECOUNT;
 2998         } else {
 2999                 vhold(vp);
 3000                 vs = VGET_HOLDCNT;
 3001         }
 3002         return (vs);
 3003 }
 3004 
 3005 void
 3006 vget_abort(struct vnode *vp, enum vgetstate vs)
 3007 {
 3008 
 3009         switch (vs) {
 3010         case VGET_USECOUNT:
 3011                 vrele(vp);
 3012                 break;
 3013         case VGET_HOLDCNT:
 3014                 vdrop(vp);
 3015                 break;
 3016         default:
 3017                 __assert_unreachable();
 3018         }
 3019 }
 3020 
 3021 int
 3022 vget(struct vnode *vp, int flags)
 3023 {
 3024         enum vgetstate vs;
 3025 
 3026         vs = vget_prep(vp);
 3027         return (vget_finish(vp, flags, vs));
 3028 }
 3029 
 3030 int
 3031 vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
 3032 {
 3033         int error;
 3034 
 3035         if ((flags & LK_INTERLOCK) != 0)
 3036                 ASSERT_VI_LOCKED(vp, __func__);
 3037         else
 3038                 ASSERT_VI_UNLOCKED(vp, __func__);
 3039         VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
 3040         VNPASS(vp->v_holdcnt > 0, vp);
 3041         VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
 3042 
 3043         error = vn_lock(vp, flags);
 3044         if (__predict_false(error != 0)) {
 3045                 vget_abort(vp, vs);
 3046                 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
 3047                     vp);
 3048                 return (error);
 3049         }
 3050 
 3051         vget_finish_ref(vp, vs);
 3052         return (0);
 3053 }
 3054 
 3055 void
 3056 vget_finish_ref(struct vnode *vp, enum vgetstate vs)
 3057 {
 3058         int old;
 3059 
 3060         VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
 3061         VNPASS(vp->v_holdcnt > 0, vp);
 3062         VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
 3063 
 3064         if (vs == VGET_USECOUNT)
 3065                 return;
 3066 
 3067         /*
 3068          * We hold the vnode. If the usecount is 0 it will be utilized to keep
 3069          * the vnode around. Otherwise someone else lent their hold count and
 3070          * we have to drop ours.
 3071          */
 3072         old = atomic_fetchadd_int(&vp->v_usecount, 1);
 3073         VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old));
 3074         if (old != 0) {
 3075 #ifdef INVARIANTS
 3076                 old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
 3077                 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
 3078 #else
 3079                 refcount_release(&vp->v_holdcnt);
 3080 #endif
 3081         }
 3082 }
 3083 
 3084 void
 3085 vref(struct vnode *vp)
 3086 {
 3087         enum vgetstate vs;
 3088 
 3089         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3090         vs = vget_prep(vp);
 3091         vget_finish_ref(vp, vs);
 3092 }
 3093 
 3094 void
 3095 vrefact(struct vnode *vp)
 3096 {
 3097 
 3098         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3099 #ifdef INVARIANTS
 3100         int old = atomic_fetchadd_int(&vp->v_usecount, 1);
 3101         VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old));
 3102 #else
 3103         refcount_acquire(&vp->v_usecount);
 3104 #endif
 3105 }
 3106 
 3107 void
 3108 vlazy(struct vnode *vp)
 3109 {
 3110         struct mount *mp;
 3111 
 3112         VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
 3113 
 3114         if ((vp->v_mflag & VMP_LAZYLIST) != 0)
 3115                 return;
 3116         /*
 3117          * We may get here for inactive routines after the vnode got doomed.
 3118          */
 3119         if (VN_IS_DOOMED(vp))
 3120                 return;
 3121         mp = vp->v_mount;
 3122         mtx_lock(&mp->mnt_listmtx);
 3123         if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
 3124                 vp->v_mflag |= VMP_LAZYLIST;
 3125                 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist);
 3126                 mp->mnt_lazyvnodelistsize++;
 3127         }
 3128         mtx_unlock(&mp->mnt_listmtx);
 3129 }
 3130 
 3131 static void
 3132 vunlazy(struct vnode *vp)
 3133 {
 3134         struct mount *mp;
 3135 
 3136         ASSERT_VI_LOCKED(vp, __func__);
 3137         VNPASS(!VN_IS_DOOMED(vp), vp);
 3138 
 3139         mp = vp->v_mount;
 3140         mtx_lock(&mp->mnt_listmtx);
 3141         VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
 3142         /*
 3143          * Don't remove the vnode from the lazy list if another thread
 3144          * has increased the hold count. It may have re-enqueued the
 3145          * vnode to the lazy list and is now responsible for its
 3146          * removal.
 3147          */
 3148         if (vp->v_holdcnt == 0) {
 3149                 vp->v_mflag &= ~VMP_LAZYLIST;
 3150                 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
 3151                 mp->mnt_lazyvnodelistsize--;
 3152         }
 3153         mtx_unlock(&mp->mnt_listmtx);
 3154 }
 3155 
 3156 /*
 3157  * This routine is only meant to be called from vgonel prior to dooming
 3158  * the vnode.
 3159  */
 3160 static void
 3161 vunlazy_gone(struct vnode *vp)
 3162 {
 3163         struct mount *mp;
 3164 
 3165         ASSERT_VOP_ELOCKED(vp, __func__);
 3166         ASSERT_VI_LOCKED(vp, __func__);
 3167         VNPASS(!VN_IS_DOOMED(vp), vp);
 3168 
 3169         if (vp->v_mflag & VMP_LAZYLIST) {
 3170                 mp = vp->v_mount;
 3171                 mtx_lock(&mp->mnt_listmtx);
 3172                 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
 3173                 vp->v_mflag &= ~VMP_LAZYLIST;
 3174                 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
 3175                 mp->mnt_lazyvnodelistsize--;
 3176                 mtx_unlock(&mp->mnt_listmtx);
 3177         }
 3178 }
 3179 
 3180 static void
 3181 vdefer_inactive(struct vnode *vp)
 3182 {
 3183 
 3184         ASSERT_VI_LOCKED(vp, __func__);
 3185         VNASSERT(vp->v_holdcnt > 0, vp,
 3186             ("%s: vnode without hold count", __func__));
 3187         if (VN_IS_DOOMED(vp)) {
 3188                 vdropl(vp);
 3189                 return;
 3190         }
 3191         if (vp->v_iflag & VI_DEFINACT) {
 3192                 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
 3193                 vdropl(vp);
 3194                 return;
 3195         }
 3196         if (vp->v_usecount > 0) {
 3197                 vp->v_iflag &= ~VI_OWEINACT;
 3198                 vdropl(vp);
 3199                 return;
 3200         }
 3201         vlazy(vp);
 3202         vp->v_iflag |= VI_DEFINACT;
 3203         VI_UNLOCK(vp);
 3204         counter_u64_add(deferred_inact, 1);
 3205 }
 3206 
 3207 static void
 3208 vdefer_inactive_unlocked(struct vnode *vp)
 3209 {
 3210 
 3211         VI_LOCK(vp);
 3212         if ((vp->v_iflag & VI_OWEINACT) == 0) {
 3213                 vdropl(vp);
 3214                 return;
 3215         }
 3216         vdefer_inactive(vp);
 3217 }
 3218 
 3219 enum vput_op { VRELE, VPUT, VUNREF };
 3220 
 3221 /*
 3222  * Handle ->v_usecount transitioning to 0.
 3223  *
 3224  * By releasing the last usecount we take ownership of the hold count which
 3225  * provides liveness of the vnode, meaning we have to vdrop.
 3226  *
 3227  * For all vnodes we may need to perform inactive processing. It requires an
 3228  * exclusive lock on the vnode, while it is legal to call here with only a
 3229  * shared lock (or no locks). If locking the vnode in an expected manner fails,
 3230  * inactive processing gets deferred to the syncer.
 3231  *
 3232  * XXX Some filesystems pass in an exclusively locked vnode and strongly depend
 3233  * on the lock being held all the way until VOP_INACTIVE. This in particular
 3234  * happens with UFS which adds half-constructed vnodes to the hash, where they
 3235  * can be found by other code.
 3236  */
 3237 static void
 3238 vput_final(struct vnode *vp, enum vput_op func)
 3239 {
 3240         int error;
 3241         bool want_unlock;
 3242 
 3243         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3244         VNPASS(vp->v_holdcnt > 0, vp);
 3245 
 3246         VI_LOCK(vp);
 3247 
 3248         /*
 3249          * By the time we got here someone else might have transitioned
 3250          * the count back to > 0.
 3251          */
 3252         if (vp->v_usecount > 0)
 3253                 goto out;
 3254 
 3255         /*
 3256          * If the vnode is doomed vgone already performed inactive processing
 3257          * (if needed).
 3258          */
 3259         if (VN_IS_DOOMED(vp))
 3260                 goto out;
 3261 
 3262         if (__predict_true(VOP_NEED_INACTIVE(vp) == 0))
 3263                 goto out;
 3264 
 3265         if (vp->v_iflag & VI_DOINGINACT)
 3266                 goto out;
 3267 
 3268         /*
 3269          * Locking operations here will drop the interlock and possibly the
 3270          * vnode lock, opening a window where the vnode can get doomed all the
 3271          * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to
 3272          * perform inactive.
 3273          */
 3274         vp->v_iflag |= VI_OWEINACT;
 3275         want_unlock = false;
 3276         error = 0;
 3277         switch (func) {
 3278         case VRELE:
 3279                 switch (VOP_ISLOCKED(vp)) {
 3280                 case LK_EXCLUSIVE:
 3281                         break;
 3282                 case LK_EXCLOTHER:
 3283                 case 0:
 3284                         want_unlock = true;
 3285                         error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
 3286                         VI_LOCK(vp);
 3287                         break;
 3288                 default:
 3289                         /*
 3290                          * The lock has at least one sharer, but we have no way
 3291                          * to conclude whether this is us. Play it safe and
 3292                          * defer processing.
 3293                          */
 3294                         error = EAGAIN;
 3295                         break;
 3296                 }
 3297                 break;
 3298         case VPUT:
 3299                 want_unlock = true;
 3300                 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
 3301                         error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
 3302                             LK_NOWAIT);
 3303                         VI_LOCK(vp);
 3304                 }
 3305                 break;
 3306         case VUNREF:
 3307                 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
 3308                         error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
 3309                         VI_LOCK(vp);
 3310                 }
 3311                 break;
 3312         }
 3313         if (error == 0) {
 3314                 if (func == VUNREF) {
 3315                         VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp,
 3316                             ("recursive vunref"));
 3317                         vp->v_vflag |= VV_UNREF;
 3318                 }
 3319                 for (;;) {
 3320                         error = vinactive(vp);
 3321                         if (want_unlock)
 3322                                 VOP_UNLOCK(vp);
 3323                         if (error != ERELOOKUP || !want_unlock)
 3324                                 break;
 3325                         VOP_LOCK(vp, LK_EXCLUSIVE);
 3326                 }
 3327                 if (func == VUNREF)
 3328                         vp->v_vflag &= ~VV_UNREF;
 3329                 vdropl(vp);
 3330         } else {
 3331                 vdefer_inactive(vp);
 3332         }
 3333         return;
 3334 out:
 3335         if (func == VPUT)
 3336                 VOP_UNLOCK(vp);
 3337         vdropl(vp);
 3338 }
 3339 
 3340 /*
 3341  * Decrement ->v_usecount for a vnode.
 3342  *
 3343  * Releasing the last use count requires additional processing, see vput_final
 3344  * above for details.
 3345  *
 3346  * Comment above each variant denotes lock state on entry and exit.
 3347  */
 3348 
 3349 /*
 3350  * in: any
 3351  * out: same as passed in
 3352  */
 3353 void
 3354 vrele(struct vnode *vp)
 3355 {
 3356 
 3357         ASSERT_VI_UNLOCKED(vp, __func__);
 3358         if (!refcount_release(&vp->v_usecount))
 3359                 return;
 3360         vput_final(vp, VRELE);
 3361 }
 3362 
 3363 /*
 3364  * in: locked
 3365  * out: unlocked
 3366  */
 3367 void
 3368 vput(struct vnode *vp)
 3369 {
 3370 
 3371         ASSERT_VOP_LOCKED(vp, __func__);
 3372         ASSERT_VI_UNLOCKED(vp, __func__);
 3373         if (!refcount_release(&vp->v_usecount)) {
 3374                 VOP_UNLOCK(vp);
 3375                 return;
 3376         }
 3377         vput_final(vp, VPUT);
 3378 }
 3379 
 3380 /*
 3381  * in: locked
 3382  * out: locked
 3383  */
 3384 void
 3385 vunref(struct vnode *vp)
 3386 {
 3387 
 3388         ASSERT_VOP_LOCKED(vp, __func__);
 3389         ASSERT_VI_UNLOCKED(vp, __func__);
 3390         if (!refcount_release(&vp->v_usecount))
 3391                 return;
 3392         vput_final(vp, VUNREF);
 3393 }
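
      /*
       * Illustrative sketch (editor's addition): the three variants above
       * differ only in the lock contract.  For a vnode obtained with
       * vget(vp, LK_EXCLUSIVE), this:
       *
       *        vput(vp);               // drops the use count and the vnode lock
       *
       * is, for most purposes, the same as spelling it out with:
       *
       *        VOP_UNLOCK(vp);
       *        vrele(vp);              // vrele() itself leaves the lock state alone
       */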
 3394 
 3395 void
 3396 vhold(struct vnode *vp)
 3397 {
 3398         int old;
 3399 
 3400         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3401         old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
 3402         VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
 3403             ("%s: wrong hold count %d", __func__, old));
 3404         if (old == 0)
 3405                 vfs_freevnodes_dec();
 3406 }
 3407 
 3408 void
 3409 vholdnz(struct vnode *vp)
 3410 {
 3411 
 3412         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3413 #ifdef INVARIANTS
 3414         int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
 3415         VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
 3416             ("%s: wrong hold count %d", __func__, old));
 3417 #else
 3418         atomic_add_int(&vp->v_holdcnt, 1);
 3419 #endif
 3420 }
 3421 
 3422 /*
 3423  * Grab a hold count unless the vnode is freed.
 3424  *
 3425  * Only use this routine if vfs smr is the only protection you have against
 3426  * freeing the vnode.
 3427  *
 3428  * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag
 3429  * is not set.  After the flag is set the vnode becomes immutable to anyone but
 3430  * the thread which managed to set the flag.
 3431  *
 3432  * It may be tempting to replace the loop with:
 3433  * count = atomic_fetchadd_int(&vp->v_holdcnt, 1);
 3434  * if (count & VHOLD_NO_SMR) {
 3435  *     backpedal and error out;
 3436  * }
 3437  *
 3438  * However, while this is more performant, it hinders debugging by eliminating
 3439  * the previously mentioned invariant.
 3440  */
 3441 bool
 3442 vhold_smr(struct vnode *vp)
 3443 {
 3444         int count;
 3445 
 3446         VFS_SMR_ASSERT_ENTERED();
 3447 
 3448         count = atomic_load_int(&vp->v_holdcnt);
 3449         for (;;) {
 3450                 if (count & VHOLD_NO_SMR) {
 3451                         VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
 3452                             ("non-zero hold count with flags %d\n", count));
 3453                         return (false);
 3454                 }
 3455                 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
 3456                 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
 3457                         if (count == 0)
 3458                                 vfs_freevnodes_dec();
 3459                         return (true);
 3460                 }
 3461         }
 3462 }
 3463 
 3464 /*
 3465  * Hold a free vnode for recycling.
 3466  *
 3467  * Note: vnode_init references this comment.
 3468  *
 3469  * Attempts to recycle only need the global vnode list lock and have no use for
 3470  * SMR.
 3471  *
 3472  * However, vnodes get inserted into the global list before they get fully
 3473  * initialized and stay there until UMA decides to free the memory. This in
 3474  * particular means the target can be found before it becomes usable and after
 3475  * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to
 3476  * VHOLD_NO_SMR.
 3477  *
 3478  * Note: the vnode may gain more references after we transition the count 0->1.
 3479  */
 3480 static bool
 3481 vhold_recycle_free(struct vnode *vp)
 3482 {
 3483         int count;
 3484 
 3485         mtx_assert(&vnode_list_mtx, MA_OWNED);
 3486 
 3487         count = atomic_load_int(&vp->v_holdcnt);
 3488         for (;;) {
 3489                 if (count & VHOLD_NO_SMR) {
 3490                         VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
 3491                             ("non-zero hold count with flags %d\n", count));
 3492                         return (false);
 3493                 }
 3494                 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
 3495                 if (count > 0) {
 3496                         return (false);
 3497                 }
 3498                 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
 3499                         vfs_freevnodes_dec();
 3500                         return (true);
 3501                 }
 3502         }
 3503 }
 3504 
 3505 static void __noinline
 3506 vdbatch_process(struct vdbatch *vd)
 3507 {
 3508         struct vnode *vp;
 3509         int i;
 3510 
 3511         mtx_assert(&vd->lock, MA_OWNED);
 3512         MPASS(curthread->td_pinned > 0);
 3513         MPASS(vd->index == VDBATCH_SIZE);
 3514 
 3515         mtx_lock(&vnode_list_mtx);
 3516         critical_enter();
 3517         freevnodes += vd->freevnodes;
 3518         for (i = 0; i < VDBATCH_SIZE; i++) {
 3519                 vp = vd->tab[i];
 3520                 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
 3521                 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
 3522                 MPASS(vp->v_dbatchcpu != NOCPU);
 3523                 vp->v_dbatchcpu = NOCPU;
 3524         }
 3525         mtx_unlock(&vnode_list_mtx);
 3526         vd->freevnodes = 0;
 3527         bzero(vd->tab, sizeof(vd->tab));
 3528         vd->index = 0;
 3529         critical_exit();
 3530 }
 3531 
 3532 static void
 3533 vdbatch_enqueue(struct vnode *vp)
 3534 {
 3535         struct vdbatch *vd;
 3536 
 3537         ASSERT_VI_LOCKED(vp, __func__);
 3538         VNASSERT(!VN_IS_DOOMED(vp), vp,
 3539             ("%s: deferring requeue of a doomed vnode", __func__));
 3540 
 3541         if (vp->v_dbatchcpu != NOCPU) {
 3542                 VI_UNLOCK(vp);
 3543                 return;
 3544         }
 3545 
 3546         sched_pin();
 3547         vd = DPCPU_PTR(vd);
 3548         mtx_lock(&vd->lock);
 3549         MPASS(vd->index < VDBATCH_SIZE);
 3550         MPASS(vd->tab[vd->index] == NULL);
 3551         /*
 3552          * A hack: we depend on being pinned so that we know what to put in
 3553          * ->v_dbatchcpu.
 3554          */
 3555         vp->v_dbatchcpu = curcpu;
 3556         vd->tab[vd->index] = vp;
 3557         vd->index++;
 3558         VI_UNLOCK(vp);
 3559         if (vd->index == VDBATCH_SIZE)
 3560                 vdbatch_process(vd);
 3561         mtx_unlock(&vd->lock);
 3562         sched_unpin();
 3563 }
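
      /*
       * Illustrative flow (editor's addition, derived from the code in this
       * file): dropping the last hold count of a live vnode funnels it
       * through the per-CPU batch before it is requeued on the global LRU:
       *
       *        vdrop(vp) -> vdropl_impl(vp, true) -> vdbatch_enqueue(vp)
       *            -> [vd->index reaches VDBATCH_SIZE] -> vdbatch_process(vd)
       *                -> requeue every batched vnode at the tail of vnode_list
       */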
 3564 
 3565 /*
 3566  * This routine must only be called for vnodes which are about to be
 3567  * deallocated. Supporting dequeue for arbitrary vnodes would require
 3568  * validating that the locked batch matches.
 3569  */
 3570 static void
 3571 vdbatch_dequeue(struct vnode *vp)
 3572 {
 3573         struct vdbatch *vd;
 3574         int i;
 3575         short cpu;
 3576 
 3577         VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp,
 3578             ("%s: called for a used vnode\n", __func__));
 3579 
 3580         cpu = vp->v_dbatchcpu;
 3581         if (cpu == NOCPU)
 3582                 return;
 3583 
 3584         vd = DPCPU_ID_PTR(cpu, vd);
 3585         mtx_lock(&vd->lock);
 3586         for (i = 0; i < vd->index; i++) {
 3587                 if (vd->tab[i] != vp)
 3588                         continue;
 3589                 vp->v_dbatchcpu = NOCPU;
 3590                 vd->index--;
 3591                 vd->tab[i] = vd->tab[vd->index];
 3592                 vd->tab[vd->index] = NULL;
 3593                 break;
 3594         }
 3595         mtx_unlock(&vd->lock);
 3596         /*
 3597          * Either we dequeued the vnode above or the target CPU beat us to it.
 3598          */
 3599         MPASS(vp->v_dbatchcpu == NOCPU);
 3600 }
 3601 
 3602 /*
 3603  * Drop the hold count of the vnode.  If this is the last reference to
 3604  * the vnode we place it on the free list unless it has been vgone'd
 3605  * (marked VIRF_DOOMED) in which case we will free it.
 3606  *
 3607  * Because the vnode vm object keeps a hold reference on the vnode if
 3608  * there is at least one resident non-cached page, the vnode cannot
 3609  * leave the active list until the page cleanup is done.
 3610  */
 3611 static void __noinline
 3612 vdropl_final(struct vnode *vp)
 3613 {
 3614 
 3615         ASSERT_VI_LOCKED(vp, __func__);
 3616         VNPASS(VN_IS_DOOMED(vp), vp);
 3617         /*
 3618          * Set the VHOLD_NO_SMR flag.
 3619          *
 3620          * We may be racing against vhold_smr. If it wins, we can just pretend
 3621          * we never got this far; it will vdrop later.
 3622          */
 3623         if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) {
 3624                 vfs_freevnodes_inc();
 3625                 VI_UNLOCK(vp);
 3626                 /*
 3627                  * We lost the aforementioned race. Any subsequent access is
 3628                  * invalid as they might have managed to vdropl on their own.
 3629                  */
 3630                 return;
 3631         }
 3632         /*
 3633          * Don't bump freevnodes as this one is going away.
 3634          */
 3635         freevnode(vp);
 3636 }
 3637 
 3638 void
 3639 vdrop(struct vnode *vp)
 3640 {
 3641 
 3642         ASSERT_VI_UNLOCKED(vp, __func__);
 3643         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3644         if (refcount_release_if_not_last(&vp->v_holdcnt))
 3645                 return;
 3646         VI_LOCK(vp);
 3647         vdropl(vp);
 3648 }
 3649 
 3650 static void __always_inline
 3651 vdropl_impl(struct vnode *vp, bool enqueue)
 3652 {
 3653 
 3654         ASSERT_VI_LOCKED(vp, __func__);
 3655         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3656         if (!refcount_release(&vp->v_holdcnt)) {
 3657                 VI_UNLOCK(vp);
 3658                 return;
 3659         }
 3660         VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp);
 3661         VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp);
 3662         if (VN_IS_DOOMED(vp)) {
 3663                 vdropl_final(vp);
 3664                 return;
 3665         }
 3666 
 3667         vfs_freevnodes_inc();
 3668         if (vp->v_mflag & VMP_LAZYLIST) {
 3669                 vunlazy(vp);
 3670         }
 3671 
 3672         if (!enqueue) {
 3673                 VI_UNLOCK(vp);
 3674                 return;
 3675         }
 3676 
 3677         /*
 3678          * Also unlocks the interlock. We can't assert on it as we
 3679          * released our hold and by now the vnode might have been
 3680          * freed.
 3681          */
 3682         vdbatch_enqueue(vp);
 3683 }
 3684 
 3685 void
 3686 vdropl(struct vnode *vp)
 3687 {
 3688 
 3689         vdropl_impl(vp, true);
 3690 }
 3691 
 3692 /*
 3693  * vdrop a vnode when recycling
 3694  *
 3695  * This is a special case routine, only to be used when recycling; it differs
 3696  * from regular vdrop by not requeueing the vnode on the LRU.
 3697  *
 3698  * Consider a case where vtryrecycle continuously fails with all vnodes (e.g.,
 3699  * due to frozen writes on the filesystem), filling the batch and causing it to
 3700  * be requeued. Then vnlru will end up revisiting the same vnodes. This is a
 3701  * loop which can last for as long as writes are frozen.
 3702  */
 3703 static void
 3704 vdropl_recycle(struct vnode *vp)
 3705 {
 3706 
 3707         vdropl_impl(vp, false);
 3708 }
 3709 
 3710 static void
 3711 vdrop_recycle(struct vnode *vp)
 3712 {
 3713 
 3714         VI_LOCK(vp);
 3715         vdropl_recycle(vp);
 3716 }
 3717 
 3718 /*
 3719  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
 3720  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
 3721  */
 3722 static int
 3723 vinactivef(struct vnode *vp)
 3724 {
 3725         struct vm_object *obj;
 3726         int error;
 3727 
 3728         ASSERT_VOP_ELOCKED(vp, "vinactive");
 3729         ASSERT_VI_LOCKED(vp, "vinactive");
 3730         VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
 3731             ("vinactive: recursed on VI_DOINGINACT"));
 3732         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3733         vp->v_iflag |= VI_DOINGINACT;
 3734         vp->v_iflag &= ~VI_OWEINACT;
 3735         VI_UNLOCK(vp);
 3736         /*
 3737          * Before moving off the active list, we must be sure that any
 3738          * modified pages are converted into the vnode's dirty
 3739          * buffers, since these will no longer be checked once the
 3740          * vnode is on the inactive list.
 3741          *
 3742          * The write-out of the dirty pages is asynchronous.  At the
 3743          * point that VOP_INACTIVE() is called, there could still be
 3744          * pending I/O and dirty pages in the object.
 3745          */
 3746         if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
 3747             vm_object_mightbedirty(obj)) {
 3748                 VM_OBJECT_WLOCK(obj);
 3749                 vm_object_page_clean(obj, 0, 0, 0);
 3750                 VM_OBJECT_WUNLOCK(obj);
 3751         }
 3752         error = VOP_INACTIVE(vp);
 3753         VI_LOCK(vp);
 3754         VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
 3755             ("vinactive: lost VI_DOINGINACT"));
 3756         vp->v_iflag &= ~VI_DOINGINACT;
 3757         return (error);
 3758 }
 3759 
 3760 int
 3761 vinactive(struct vnode *vp)
 3762 {
 3763 
 3764         ASSERT_VOP_ELOCKED(vp, "vinactive");
 3765         ASSERT_VI_LOCKED(vp, "vinactive");
 3766         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3767 
 3768         if ((vp->v_iflag & VI_OWEINACT) == 0)
 3769                 return (0);
 3770         if (vp->v_iflag & VI_DOINGINACT)
 3771                 return (0);
 3772         if (vp->v_usecount > 0) {
 3773                 vp->v_iflag &= ~VI_OWEINACT;
 3774                 return (0);
 3775         }
 3776         return (vinactivef(vp));
 3777 }
 3778 
 3779 /*
 3780  * Remove any vnodes in the vnode table belonging to mount point mp.
 3781  *
 3782  * If FORCECLOSE is not specified, there should not be any active ones,
 3783  * return error if any are found (nb: this is a user error, not a
 3784  * system error). If FORCECLOSE is specified, detach any active vnodes
 3785  * that are found.
 3786  *
 3787  * If WRITECLOSE is set, only flush out regular file vnodes open for
 3788  * writing.
 3789  *
 3790  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
 3791  *
 3792  * `rootrefs' specifies the base reference count for the root vnode
 3793  * of this filesystem. The root vnode is considered busy if its
 3794  * v_usecount exceeds this value. On a successful return, vflush()
 3795  * will call vrele() on the root vnode exactly rootrefs times.
 3796  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 3797  * be zero.
 3798  */
 3799 #ifdef DIAGNOSTIC
 3800 static int busyprt = 0;         /* print out busy vnodes */
 3801 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
 3802 #endif
 3803 
 3804 int
 3805 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
 3806 {
 3807         struct vnode *vp, *mvp, *rootvp = NULL;
 3808         struct vattr vattr;
 3809         int busy = 0, error;
 3810 
 3811         CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
 3812             rootrefs, flags);
 3813         if (rootrefs > 0) {
 3814                 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
 3815                     ("vflush: bad args"));
 3816                 /*
 3817                  * Get the filesystem root vnode. We can vput() it
 3818                  * immediately, since with rootrefs > 0, it won't go away.
 3819                  */
 3820                 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
 3821                         CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
 3822                             __func__, error);
 3823                         return (error);
 3824                 }
 3825                 vput(rootvp);
 3826         }
 3827 loop:
 3828         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 3829                 vholdl(vp);
 3830                 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
 3831                 if (error) {
 3832                         vdrop(vp);
 3833                         MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 3834                         goto loop;
 3835                 }
 3836                 /*
 3837                  * Skip over vnodes marked VV_SYSTEM.
 3838                  */
 3839                 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
 3840                         VOP_UNLOCK(vp);
 3841                         vdrop(vp);
 3842                         continue;
 3843                 }
 3844                 /*
 3845                  * If WRITECLOSE is set, flush out unlinked but still open
 3846                  * files (even if open only for reading) and regular file
 3847                  * vnodes open for writing.
 3848                  */
 3849                 if (flags & WRITECLOSE) {
 3850                         if (vp->v_object != NULL) {
 3851                                 VM_OBJECT_WLOCK(vp->v_object);
 3852                                 vm_object_page_clean(vp->v_object, 0, 0, 0);
 3853                                 VM_OBJECT_WUNLOCK(vp->v_object);
 3854                         }
 3855                         do {
 3856                                 error = VOP_FSYNC(vp, MNT_WAIT, td);
 3857                         } while (error == ERELOOKUP);
 3858                         if (error != 0) {
 3859                                 VOP_UNLOCK(vp);
 3860                                 vdrop(vp);
 3861                                 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 3862                                 return (error);
 3863                         }
 3864                         error = VOP_GETATTR(vp, &vattr, td->td_ucred);
 3865                         VI_LOCK(vp);
 3866 
 3867                         if ((vp->v_type == VNON ||
 3868                             (error == 0 && vattr.va_nlink > 0)) &&
 3869                             (vp->v_writecount <= 0 || vp->v_type != VREG)) {
 3870                                 VOP_UNLOCK(vp);
 3871                                 vdropl(vp);
 3872                                 continue;
 3873                         }
 3874                 } else
 3875                         VI_LOCK(vp);
 3876                 /*
 3877                  * With v_usecount == 0, all we need to do is clear out the
 3878                  * vnode data structures and we are done.
 3879                  *
 3880                  * If FORCECLOSE is set, forcibly close the vnode.
 3881                  */
 3882                 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
 3883                         vgonel(vp);
 3884                 } else {
 3885                         busy++;
 3886 #ifdef DIAGNOSTIC
 3887                         if (busyprt)
 3888                                 vn_printf(vp, "vflush: busy vnode ");
 3889 #endif
 3890                 }
 3891                 VOP_UNLOCK(vp);
 3892                 vdropl(vp);
 3893         }
 3894         if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
 3895                 /*
 3896                  * If just the root vnode is busy, and if its refcount
 3897                  * is equal to `rootrefs', then go ahead and kill it.
 3898                  */
 3899                 VI_LOCK(rootvp);
 3900                 KASSERT(busy > 0, ("vflush: not busy"));
 3901                 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
 3902                     ("vflush: usecount %d < rootrefs %d",
 3903                      rootvp->v_usecount, rootrefs));
 3904                 if (busy == 1 && rootvp->v_usecount == rootrefs) {
 3905                         VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
 3906                         vgone(rootvp);
 3907                         VOP_UNLOCK(rootvp);
 3908                         busy = 0;
 3909                 } else
 3910                         VI_UNLOCK(rootvp);
 3911         }
 3912         if (busy) {
 3913                 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
 3914                     busy);
 3915                 return (EBUSY);
 3916         }
 3917         for (; rootrefs > 0; rootrefs--)
 3918                 vrele(rootvp);
 3919         return (0);
 3920 }
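
      /*
       * Illustrative sketch (editor's addition): an unmount path that holds
       * no extra references on its root vnode would typically do something
       * like:
       *
       *        flags = (mntflags & MNT_FORCE) != 0 ? FORCECLOSE : 0;
       *        error = vflush(mp, 0, flags, curthread);
       *
       * A filesystem that keeps its root vnode referenced for the lifetime of
       * the mount passes that reference count as rootrefs instead of 0.
       */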
 3921 
 3922 /*
 3923  * Recycle an unused vnode to the front of the free list.
 3924  */
 3925 int
 3926 vrecycle(struct vnode *vp)
 3927 {
 3928         int recycled;
 3929 
 3930         VI_LOCK(vp);
 3931         recycled = vrecyclel(vp);
 3932         VI_UNLOCK(vp);
 3933         return (recycled);
 3934 }
 3935 
 3936 /*
 3937  * vrecycle, with the vp interlock held.
 3938  */
 3939 int
 3940 vrecyclel(struct vnode *vp)
 3941 {
 3942         int recycled;
 3943 
 3944         ASSERT_VOP_ELOCKED(vp, __func__);
 3945         ASSERT_VI_LOCKED(vp, __func__);
 3946         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 3947         recycled = 0;
 3948         if (vp->v_usecount == 0) {
 3949                 recycled = 1;
 3950                 vgonel(vp);
 3951         }
 3952         return (recycled);
 3953 }
 3954 
 3955 /*
 3956  * Eliminate all activity associated with a vnode
 3957  * in preparation for reuse.
 3958  */
 3959 void
 3960 vgone(struct vnode *vp)
 3961 {
 3962         VI_LOCK(vp);
 3963         vgonel(vp);
 3964         VI_UNLOCK(vp);
 3965 }
 3966 
 3967 /*
 3968  * Notify upper mounts about reclaimed or unlinked vnode.
 3969  */
 3970 void
 3971 vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event)
 3972 {
 3973         struct mount *mp;
 3974         struct mount_upper_node *ump;
 3975 
 3976         mp = atomic_load_ptr(&vp->v_mount);
 3977         if (mp == NULL)
 3978                 return;
 3979         if (TAILQ_EMPTY(&mp->mnt_notify))
 3980                 return;
 3981 
 3982         MNT_ILOCK(mp);
 3983         mp->mnt_upper_pending++;
 3984         KASSERT(mp->mnt_upper_pending > 0,
 3985             ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending));
 3986         TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) {
 3987                 MNT_IUNLOCK(mp);
 3988                 switch (event) {
 3989                 case VFS_NOTIFY_UPPER_RECLAIM:
 3990                         VFS_RECLAIM_LOWERVP(ump->mp, vp);
 3991                         break;
 3992                 case VFS_NOTIFY_UPPER_UNLINK:
 3993                         VFS_UNLINK_LOWERVP(ump->mp, vp);
 3994                         break;
 3995                 }
 3996                 MNT_ILOCK(mp);
 3997         }
 3998         mp->mnt_upper_pending--;
 3999         if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 &&
 4000             mp->mnt_upper_pending == 0) {
 4001                 mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER;
 4002                 wakeup(&mp->mnt_uppers);
 4003         }
 4004         MNT_IUNLOCK(mp);
 4005 }
 4006 
 4007 /*
 4008  * vgone, with the vp interlock held.
 4009  */
 4010 static void
 4011 vgonel(struct vnode *vp)
 4012 {
 4013         struct thread *td;
 4014         struct mount *mp;
 4015         vm_object_t object;
 4016         bool active, doinginact, oweinact;
 4017 
 4018         ASSERT_VOP_ELOCKED(vp, "vgonel");
 4019         ASSERT_VI_LOCKED(vp, "vgonel");
 4020         VNASSERT(vp->v_holdcnt, vp,
 4021             ("vgonel: vp %p has no reference.", vp));
 4022         CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 4023         td = curthread;
 4024 
 4025         /*
 4026          * Don't vgonel if we're already doomed.
 4027          */
 4028         if (VN_IS_DOOMED(vp)) {
 4029                 VNPASS(vn_get_state(vp) == VSTATE_DESTROYING || \
 4030                     vn_get_state(vp) == VSTATE_DEAD, vp);
 4031                 return;
 4032         }
 4033         /*
 4034          * Paired with freevnode.
 4035          */
 4036         vn_seqc_write_begin_locked(vp);
 4037         vunlazy_gone(vp);
 4038         vn_irflag_set_locked(vp, VIRF_DOOMED);
 4039         vn_set_state(vp, VSTATE_DESTROYING);
 4040 
 4041         /*
 4042          * Check to see if the vnode is in use.  If so, we have to
 4043          * call VOP_CLOSE() and VOP_INACTIVE().
 4044          *
 4045          * It could be that VOP_INACTIVE() requested reclamation, in
 4046          * which case we should avoid recursion, so check
 4047          * VI_DOINGINACT.  This is not precise but good enough.
 4048          */
 4049         active = vp->v_usecount > 0;
 4050         oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
 4051         doinginact = (vp->v_iflag & VI_DOINGINACT) != 0;
 4052 
 4053         /*
 4054          * If we need to do inactive, VI_OWEINACT will be set.
 4055          */
 4056         if (vp->v_iflag & VI_DEFINACT) {
 4057                 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
 4058                 vp->v_iflag &= ~VI_DEFINACT;
 4059                 vdropl(vp);
 4060         } else {
 4061                 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count"));
 4062                 VI_UNLOCK(vp);
 4063         }
 4064         cache_purge_vgone(vp);
 4065         vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
 4066 
 4067         /*
 4068          * If purging an active vnode, it must be closed and
 4069          * deactivated before being reclaimed.
 4070          */
 4071         if (active)
 4072                 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
 4073         if (!doinginact) {
 4074                 do {
 4075                         if (oweinact || active) {
 4076                                 VI_LOCK(vp);
 4077                                 vinactivef(vp);
 4078                                 oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
 4079                                 VI_UNLOCK(vp);
 4080                         }
 4081                 } while (oweinact);
 4082         }
 4083         if (vp->v_type == VSOCK)
 4084                 vfs_unp_reclaim(vp);
 4085 
 4086         /*
 4087          * Clean out any buffers associated with the vnode.
 4088          * If the flush fails, just toss the buffers.
 4089          */
 4090         mp = NULL;
 4091         if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
 4092                 (void) vn_start_secondary_write(vp, &mp, V_WAIT);
 4093         if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
 4094                 while (vinvalbuf(vp, 0, 0, 0) != 0)
 4095                         ;
 4096         }
 4097 
 4098         BO_LOCK(&vp->v_bufobj);
 4099         KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
 4100             vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
 4101             TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
 4102             vp->v_bufobj.bo_clean.bv_cnt == 0,
 4103             ("vp %p bufobj not invalidated", vp));
 4104 
 4105         /*
 4106          * For VMIO bufobj, BO_DEAD is set later, or in
 4107          * vm_object_terminate() after the object's page queue is
 4108          * flushed.
 4109          */
 4110         object = vp->v_bufobj.bo_object;
 4111         if (object == NULL)
 4112                 vp->v_bufobj.bo_flag |= BO_DEAD;
 4113         BO_UNLOCK(&vp->v_bufobj);
 4114 
 4115         /*
 4116          * Handle the VM part.  Tmpfs handles v_object on its own (the
 4117          * OBJT_VNODE check).  Nullfs or other bypassing filesystems
 4118          * should not touch the object borrowed from the lower vnode
 4119          * (the handle check).
 4120          */
 4121         if (object != NULL && object->type == OBJT_VNODE &&
 4122             object->handle == vp)
 4123                 vnode_destroy_vobject(vp);
 4124 
 4125         /*
 4126          * Reclaim the vnode.
 4127          */
 4128         if (VOP_RECLAIM(vp))
 4129                 panic("vgone: cannot reclaim");
 4130         if (mp != NULL)
 4131                 vn_finished_secondary_write(mp);
 4132         VNASSERT(vp->v_object == NULL, vp,
 4133             ("vop_reclaim left v_object vp=%p", vp));
 4134         /*
 4135          * Clear the advisory locks and wake up waiting threads.
 4136          */
 4137         if (vp->v_lockf != NULL) {
 4138                 (void)VOP_ADVLOCKPURGE(vp);
 4139                 vp->v_lockf = NULL;
 4140         }
 4141         /*
 4142          * Delete from old mount point vnode list.
 4143          */
 4144         if (vp->v_mount == NULL) {
 4145                 VI_LOCK(vp);
 4146         } else {
 4147                 delmntque(vp);
 4148                 ASSERT_VI_LOCKED(vp, "vgonel 2");
 4149         }
 4150         /*
 4151          * Done with purge, reset to the standard lock and invalidate
 4152          * the vnode.
 4153          */
 4154         vp->v_vnlock = &vp->v_lock;
 4155         vp->v_op = &dead_vnodeops;
 4156         vp->v_type = VBAD;
 4157         vn_set_state(vp, VSTATE_DEAD);
 4158 }
 4159 
 4160 /*
 4161  * Print out a description of a vnode.
 4162  */
 4163 static const char *const vtypename[] = {
 4164         [VNON] = "VNON",
 4165         [VREG] = "VREG",
 4166         [VDIR] = "VDIR",
 4167         [VBLK] = "VBLK",
 4168         [VCHR] = "VCHR",
 4169         [VLNK] = "VLNK",
 4170         [VSOCK] = "VSOCK",
 4171         [VFIFO] = "VFIFO",
 4172         [VBAD] = "VBAD",
 4173         [VMARKER] = "VMARKER",
 4174 };
 4175 _Static_assert(nitems(vtypename) == VLASTTYPE + 1,
 4176     "vnode type name not added to vtypename");
 4177 
 4178 static const char *const vstatename[] = {
 4179         [VSTATE_UNINITIALIZED] = "VSTATE_UNINITIALIZED",
 4180         [VSTATE_CONSTRUCTED] = "VSTATE_CONSTRUCTED",
 4181         [VSTATE_DESTROYING] = "VSTATE_DESTROYING",
 4182         [VSTATE_DEAD] = "VSTATE_DEAD",
 4183 };
 4184 _Static_assert(nitems(vstatename) == VLASTSTATE + 1,
 4185     "vnode state name not added to vstatename");
 4186 
 4187 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0,
 4188     "new hold count flag not added to vn_printf");
 4189 
 4190 void
 4191 vn_printf(struct vnode *vp, const char *fmt, ...)
 4192 {
 4193         va_list ap;
 4194         char buf[256], buf2[16];
 4195         u_long flags;
 4196         u_int holdcnt;
 4197         short irflag;
 4198 
 4199         va_start(ap, fmt);
 4200         vprintf(fmt, ap);
 4201         va_end(ap);
 4202         printf("%p: ", (void *)vp);
 4203         printf("type %s state %s\n", vtypename[vp->v_type], vstatename[vp->v_state]);
 4204         holdcnt = atomic_load_int(&vp->v_holdcnt);
 4205         printf("    usecount %d, writecount %d, refcount %d seqc users %d",
 4206             vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS,
 4207             vp->v_seqc_users);
 4208         switch (vp->v_type) {
 4209         case VDIR:
 4210                 printf(" mountedhere %p\n", vp->v_mountedhere);
 4211                 break;
 4212         case VCHR:
 4213                 printf(" rdev %p\n", vp->v_rdev);
 4214                 break;
 4215         case VSOCK:
 4216                 printf(" socket %p\n", vp->v_unpcb);
 4217                 break;
 4218         case VFIFO:
 4219                 printf(" fifoinfo %p\n", vp->v_fifoinfo);
 4220                 break;
 4221         default:
 4222                 printf("\n");
 4223                 break;
 4224         }
 4225         buf[0] = '\0';
 4226         buf[1] = '\0';
 4227         if (holdcnt & VHOLD_NO_SMR)
 4228                 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf));
 4229         printf("    hold count flags (%s)\n", buf + 1);
 4230 
 4231         buf[0] = '\0';
 4232         buf[1] = '\0';
 4233         irflag = vn_irflag_read(vp);
 4234         if (irflag & VIRF_DOOMED)
 4235                 strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
 4236         if (irflag & VIRF_PGREAD)
 4237                 strlcat(buf, "|VIRF_PGREAD", sizeof(buf));
 4238         if (irflag & VIRF_MOUNTPOINT)
 4239                 strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf));
 4240         if (irflag & VIRF_TEXT_REF)
 4241                 strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf));
 4242         flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF);
 4243         if (flags != 0) {
 4244                 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
 4245                 strlcat(buf, buf2, sizeof(buf));
 4246         }
 4247         if (vp->v_vflag & VV_ROOT)
 4248                 strlcat(buf, "|VV_ROOT", sizeof(buf));
 4249         if (vp->v_vflag & VV_ISTTY)
 4250                 strlcat(buf, "|VV_ISTTY", sizeof(buf));
 4251         if (vp->v_vflag & VV_NOSYNC)
 4252                 strlcat(buf, "|VV_NOSYNC", sizeof(buf));
 4253         if (vp->v_vflag & VV_ETERNALDEV)
 4254                 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
 4255         if (vp->v_vflag & VV_CACHEDLABEL)
 4256                 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
 4257         if (vp->v_vflag & VV_VMSIZEVNLOCK)
 4258                 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf));
 4259         if (vp->v_vflag & VV_COPYONWRITE)
 4260                 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
 4261         if (vp->v_vflag & VV_SYSTEM)
 4262                 strlcat(buf, "|VV_SYSTEM", sizeof(buf));
 4263         if (vp->v_vflag & VV_PROCDEP)
 4264                 strlcat(buf, "|VV_PROCDEP", sizeof(buf));
 4265         if (vp->v_vflag & VV_DELETED)
 4266                 strlcat(buf, "|VV_DELETED", sizeof(buf));
 4267         if (vp->v_vflag & VV_MD)
 4268                 strlcat(buf, "|VV_MD", sizeof(buf));
 4269         if (vp->v_vflag & VV_FORCEINSMQ)
 4270                 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
 4271         if (vp->v_vflag & VV_READLINK)
 4272                 strlcat(buf, "|VV_READLINK", sizeof(buf));
 4273         flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
 4274             VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM |
 4275             VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK);
 4276         if (flags != 0) {
 4277                 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
 4278                 strlcat(buf, buf2, sizeof(buf));
 4279         }
 4280         if (vp->v_iflag & VI_MOUNT)
 4281                 strlcat(buf, "|VI_MOUNT", sizeof(buf));
 4282         if (vp->v_iflag & VI_DOINGINACT)
 4283                 strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
 4284         if (vp->v_iflag & VI_OWEINACT)
 4285                 strlcat(buf, "|VI_OWEINACT", sizeof(buf));
 4286         if (vp->v_iflag & VI_DEFINACT)
 4287                 strlcat(buf, "|VI_DEFINACT", sizeof(buf));
 4288         if (vp->v_iflag & VI_FOPENING)
 4289                 strlcat(buf, "|VI_FOPENING", sizeof(buf));
 4290         flags = vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT |
 4291             VI_OWEINACT | VI_DEFINACT | VI_FOPENING);
 4292         if (flags != 0) {
 4293                 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
 4294                 strlcat(buf, buf2, sizeof(buf));
 4295         }
 4296         if (vp->v_mflag & VMP_LAZYLIST)
 4297                 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf));
 4298         flags = vp->v_mflag & ~(VMP_LAZYLIST);
 4299         if (flags != 0) {
 4300                 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
 4301                 strlcat(buf, buf2, sizeof(buf));
 4302         }
 4303         printf("    flags (%s)", buf + 1);
 4304         if (mtx_owned(VI_MTX(vp)))
 4305                 printf(" VI_LOCKed");
 4306         printf("\n");
 4307         if (vp->v_object != NULL)
 4308                 printf("    v_object %p ref %d pages %d "
 4309                     "cleanbuf %d dirtybuf %d\n",
 4310                     vp->v_object, vp->v_object->ref_count,
 4311                     vp->v_object->resident_page_count,
 4312                     vp->v_bufobj.bo_clean.bv_cnt,
 4313                     vp->v_bufobj.bo_dirty.bv_cnt);
 4314         printf("    ");
 4315         lockmgr_printinfo(vp->v_vnlock);
 4316         if (vp->v_data != NULL)
 4317                 VOP_PRINT(vp);
 4318 }
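/*
 * Illustration (not part of the source above): each recognized bit appends
 * "|NAME" to buf, any leftover bits are appended as "|VV(0x...)" (and
 * similarly for the VI and VMP groups), and printing starts at buf + 1 so
 * the leading '|' is dropped.  For a hypothetical vnode with VV_ROOT and
 * VV_SYSTEM set plus one unrecognized v_vflag bit, the output line would
 * read roughly:
 *
 *      flags (VV_ROOT|VV_SYSTEM|VV(0x80000000))
 */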
 4319 
 4320 #ifdef DDB
 4321 /*
 4322  * List all of the locked vnodes in the system.
 4323  * Called when debugging the kernel.
 4324  */
 4325 DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE)
 4326 {
 4327         struct mount *mp;
 4328         struct vnode *vp;
 4329 
 4330         /*
 4331          * Note: because this is DDB, we can't obey the locking semantics
 4332          * for these structures, which means we could catch an inconsistent
 4333          * state and dereference a nasty pointer.  Not much to be done
 4334          * about that.
 4335          */
 4336         db_printf("Locked vnodes\n");
 4337         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 4338                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 4339                         if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
 4340                                 vn_printf(vp, "vnode ");
 4341                 }
 4342         }
 4343 }
 4344 
 4345 /*
 4346  * Show details about the given vnode.
 4347  */
 4348 DB_SHOW_COMMAND(vnode, db_show_vnode)
 4349 {
 4350         struct vnode *vp;
 4351 
 4352         if (!have_addr)
 4353                 return;
 4354         vp = (struct vnode *)addr;
 4355         vn_printf(vp, "vnode ");
 4356 }
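/*
 * Hedged usage sketch (not part of the source): the commands defined here
 * are entered at the in-kernel debugger prompt once DDB has been reached,
 * for example:
 *
 *      db> show lockedvnods
 *      db> show vnode 0xfffff800deadbeef
 *
 * Both end up in vn_printf() above; the address is an illustrative
 * placeholder taken from some other command's output.
 */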
 4357 
 4358 /*
 4359  * Show details about the given mount point.
 4360  */
 4361 DB_SHOW_COMMAND(mount, db_show_mount)
 4362 {
 4363         struct mount *mp;
 4364         struct vfsopt *opt;
 4365         struct statfs *sp;
 4366         struct vnode *vp;
 4367         char buf[512];
 4368         uint64_t mflags;
 4369         u_int flags;
 4370 
 4371         if (!have_addr) {
 4372                 /* No address given, print short info about all mount points. */
 4373                 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 4374                         db_printf("%p %s on %s (%s)\n", mp,
 4375                             mp->mnt_stat.f_mntfromname,
 4376                             mp->mnt_stat.f_mntonname,
 4377                             mp->mnt_stat.f_fstypename);
 4378                         if (db_pager_quit)
 4379                                 break;
 4380                 }
 4381                 db_printf("\nMore info: show mount <addr>\n");
 4382                 return;
 4383         }
 4384 
 4385         mp = (struct mount *)addr;
 4386         db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
 4387             mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
 4388 
 4389         buf[0] = '\0';
 4390         mflags = mp->mnt_flag;
 4391 #define MNT_FLAG(flag)  do {                                            \
 4392         if (mflags & (flag)) {                                          \
 4393                 if (buf[0] != '\0')                                     \
 4394                         strlcat(buf, ", ", sizeof(buf));                \
 4395                 strlcat(buf, (#flag) + 4, sizeof(buf));                 \
 4396                 mflags &= ~(flag);                                      \
 4397         }                                                               \
 4398 } while (0)
 4399         MNT_FLAG(MNT_RDONLY);
 4400         MNT_FLAG(MNT_SYNCHRONOUS);
 4401         MNT_FLAG(MNT_NOEXEC);
 4402         MNT_FLAG(MNT_NOSUID);
 4403         MNT_FLAG(MNT_NFS4ACLS);
 4404         MNT_FLAG(MNT_UNION);
 4405         MNT_FLAG(MNT_ASYNC);
 4406         MNT_FLAG(MNT_SUIDDIR);
 4407         MNT_FLAG(MNT_SOFTDEP);
 4408         MNT_FLAG(MNT_NOSYMFOLLOW);
 4409         MNT_FLAG(MNT_GJOURNAL);
 4410         MNT_FLAG(MNT_MULTILABEL);
 4411         MNT_FLAG(MNT_ACLS);
 4412         MNT_FLAG(MNT_NOATIME);
 4413         MNT_FLAG(MNT_NOCLUSTERR);
 4414         MNT_FLAG(MNT_NOCLUSTERW);
 4415         MNT_FLAG(MNT_SUJ);
 4416         MNT_FLAG(MNT_EXRDONLY);
 4417         MNT_FLAG(MNT_EXPORTED);
 4418         MNT_FLAG(MNT_DEFEXPORTED);
 4419         MNT_FLAG(MNT_EXPORTANON);
 4420         MNT_FLAG(MNT_EXKERB);
 4421         MNT_FLAG(MNT_EXPUBLIC);
 4422         MNT_FLAG(MNT_LOCAL);
 4423         MNT_FLAG(MNT_QUOTA);
 4424         MNT_FLAG(MNT_ROOTFS);
 4425         MNT_FLAG(MNT_USER);
 4426         MNT_FLAG(MNT_IGNORE);
 4427         MNT_FLAG(MNT_UPDATE);
 4428         MNT_FLAG(MNT_DELEXPORT);
 4429         MNT_FLAG(MNT_RELOAD);
 4430         MNT_FLAG(MNT_FORCE);
 4431         MNT_FLAG(MNT_SNAPSHOT);
 4432         MNT_FLAG(MNT_BYFSID);
 4433 #undef MNT_FLAG
 4434         if (mflags != 0) {
 4435                 if (buf[0] != '\0')
 4436                         strlcat(buf, ", ", sizeof(buf));
 4437                 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
 4438                     "0x%016jx", mflags);
 4439         }
 4440         db_printf("    mnt_flag = %s\n", buf);
 4441 
 4442         buf[0] = '\0';
 4443         flags = mp->mnt_kern_flag;
 4444 #define MNT_KERN_FLAG(flag)     do {                                    \
 4445         if (flags & (flag)) {                                           \
 4446                 if (buf[0] != '\0')                                     \
 4447                         strlcat(buf, ", ", sizeof(buf));                \
 4448                 strlcat(buf, (#flag) + 5, sizeof(buf));                 \
 4449                 flags &= ~(flag);                                       \
 4450         }                                                               \
 4451 } while (0)
 4452         MNT_KERN_FLAG(MNTK_UNMOUNTF);
 4453         MNT_KERN_FLAG(MNTK_ASYNC);
 4454         MNT_KERN_FLAG(MNTK_SOFTDEP);
 4455         MNT_KERN_FLAG(MNTK_NOMSYNC);
 4456         MNT_KERN_FLAG(MNTK_DRAINING);
 4457         MNT_KERN_FLAG(MNTK_REFEXPIRE);
 4458         MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
 4459         MNT_KERN_FLAG(MNTK_SHARED_WRITES);
 4460         MNT_KERN_FLAG(MNTK_NO_IOPF);
 4461         MNT_KERN_FLAG(MNTK_RECURSE);
 4462         MNT_KERN_FLAG(MNTK_UPPER_WAITER);
 4463         MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE);
 4464         MNT_KERN_FLAG(MNTK_USES_BCACHE);
 4465         MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG);
 4466         MNT_KERN_FLAG(MNTK_FPLOOKUP);
 4467         MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER);
 4468         MNT_KERN_FLAG(MNTK_NOASYNC);
 4469         MNT_KERN_FLAG(MNTK_UNMOUNT);
 4470         MNT_KERN_FLAG(MNTK_MWAIT);
 4471         MNT_KERN_FLAG(MNTK_SUSPEND);
 4472         MNT_KERN_FLAG(MNTK_SUSPEND2);
 4473         MNT_KERN_FLAG(MNTK_SUSPENDED);
 4474         MNT_KERN_FLAG(MNTK_NULL_NOCACHE);
 4475         MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
 4476 #undef MNT_KERN_FLAG
 4477         if (flags != 0) {
 4478                 if (buf[0] != '\0')
 4479                         strlcat(buf, ", ", sizeof(buf));
 4480                 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
 4481                     "0x%08x", flags);
 4482         }
 4483         db_printf("    mnt_kern_flag = %s\n", buf);
 4484 
 4485         db_printf("    mnt_opt = ");
 4486         opt = TAILQ_FIRST(mp->mnt_opt);
 4487         if (opt != NULL) {
 4488                 db_printf("%s", opt->name);
 4489                 opt = TAILQ_NEXT(opt, link);
 4490                 while (opt != NULL) {
 4491                         db_printf(", %s", opt->name);
 4492                         opt = TAILQ_NEXT(opt, link);
 4493                 }
 4494         }
 4495         db_printf("\n");
 4496 
 4497         sp = &mp->mnt_stat;
 4498         db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
 4499             "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
 4500             "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
 4501             "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
 4502             (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
 4503             (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
 4504             (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
 4505             (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
 4506             (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
 4507             (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
 4508             (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
 4509             (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
 4510 
 4511         db_printf("    mnt_cred = { uid=%u ruid=%u",
 4512             (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
 4513         if (jailed(mp->mnt_cred))
 4514                 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
 4515         db_printf(" }\n");
 4516         db_printf("    mnt_ref = %d (with %d in the struct)\n",
 4517             vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
 4518         db_printf("    mnt_gen = %d\n", mp->mnt_gen);
 4519         db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
 4520         db_printf("    mnt_lazyvnodelistsize = %d\n",
 4521             mp->mnt_lazyvnodelistsize);
 4522         db_printf("    mnt_writeopcount = %d (with %d in the struct)\n",
 4523             vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
 4524         db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
 4525         db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
 4526         db_printf("    mnt_lockref = %d (with %d in the struct)\n",
 4527             vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref);
 4528         db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
 4529         db_printf("    mnt_secondary_accwrites = %d\n",
 4530             mp->mnt_secondary_accwrites);
 4531         db_printf("    mnt_gjprovider = %s\n",
 4532             mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
 4533         db_printf("    mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);
 4534 
 4535         db_printf("\n\nList of active vnodes\n");
 4536         TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 4537                 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) {
 4538                         vn_printf(vp, "vnode ");
 4539                         if (db_pager_quit)
 4540                                 break;
 4541                 }
 4542         }
 4543         db_printf("\n\nList of inactive vnodes\n");
 4544         TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 4545                 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) {
 4546                         vn_printf(vp, "vnode ");
 4547                         if (db_pager_quit)
 4548                                 break;
 4549                 }
 4550         }
 4551 }
 4552 #endif  /* DDB */
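/*
 * A minimal userland sketch (not part of the source) of the stringize-and-
 * skip-prefix idiom used by MNT_FLAG()/MNT_KERN_FLAG() above: #flag turns
 * the macro argument into a string literal, and the "+ 4"/"+ 5" pointer
 * arithmetic skips the "MNT_"/"MNTK_" prefix before the name is appended
 * to the buffer.  The DEMO_* names are hypothetical.
 */
#include <stdio.h>
#include <string.h>

#define DEMO_RDONLY     0x0001
#define DEMO_NOEXEC     0x0004

#define DEMO_FLAG(flag) do {                                            \
        if (flags & (flag)) {                                           \
                if (buf[0] != '\0')                                     \
                        strlcat(buf, ", ", sizeof(buf));                \
                /* #flag is "DEMO_RDONLY"; + 5 skips "DEMO_". */        \
                strlcat(buf, (#flag) + 5, sizeof(buf));                 \
        }                                                               \
} while (0)

int
main(void)
{
        char buf[64] = "";
        int flags = DEMO_RDONLY | DEMO_NOEXEC;

        DEMO_FLAG(DEMO_RDONLY);
        DEMO_FLAG(DEMO_NOEXEC);
        printf("%s\n", buf);    /* prints "RDONLY, NOEXEC" */
        return (0);
}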
 4553 
 4554 /*
 4555  * Fill in a struct xvfsconf based on a struct vfsconf.
 4556  */
 4557 static int
 4558 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
 4559 {
 4560         struct xvfsconf xvfsp;
 4561 
 4562         bzero(&xvfsp, sizeof(xvfsp));
 4563         strcpy(xvfsp.vfc_name, vfsp->vfc_name);
 4564         xvfsp.vfc_typenum = vfsp->vfc_typenum;
 4565         xvfsp.vfc_refcount = vfsp->vfc_refcount;
 4566         xvfsp.vfc_flags = vfsp->vfc_flags;
 4567         /*
 4568          * These are unused in userland; we keep them
 4569          * to avoid breaking binary compatibility.
 4570          */
 4571         xvfsp.vfc_vfsops = NULL;
 4572         xvfsp.vfc_next = NULL;
 4573         return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 4574 }
 4575 
 4576 #ifdef COMPAT_FREEBSD32
 4577 struct xvfsconf32 {
 4578         uint32_t        vfc_vfsops;
 4579         char            vfc_name[MFSNAMELEN];
 4580         int32_t         vfc_typenum;
 4581         int32_t         vfc_refcount;
 4582         int32_t         vfc_flags;
 4583         uint32_t        vfc_next;
 4584 };
 4585 
 4586 static int
 4587 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
 4588 {
 4589         struct xvfsconf32 xvfsp;
 4590 
 4591         bzero(&xvfsp, sizeof(xvfsp));
 4592         strcpy(xvfsp.vfc_name, vfsp->vfc_name);
 4593         xvfsp.vfc_typenum = vfsp->vfc_typenum;
 4594         xvfsp.vfc_refcount = vfsp->vfc_refcount;
 4595         xvfsp.vfc_flags = vfsp->vfc_flags;
 4596         return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 4597 }
 4598 #endif
 4599 
 4600 /*
 4601  * Top level filesystem related information gathering.
 4602  */
 4603 static int
 4604 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
 4605 {
 4606         struct vfsconf *vfsp;
 4607         int error;
 4608 
 4609         error = 0;
 4610         vfsconf_slock();
 4611         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 4612 #ifdef COMPAT_FREEBSD32
 4613                 if (req->flags & SCTL_MASK32)
 4614                         error = vfsconf2x32(req, vfsp);
 4615                 else
 4616 #endif
 4617                         error = vfsconf2x(req, vfsp);
 4618                 if (error)
 4619                         break;
 4620         }
 4621         vfsconf_sunlock();
 4622         return (error);
 4623 }
 4624 
 4625 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
 4626     CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
 4627     "S,xvfsconf", "List of all configured filesystems");
 4628 
 4629 #ifndef BURN_BRIDGES
 4630 static int      sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
 4631 
 4632 static int
 4633 vfs_sysctl(SYSCTL_HANDLER_ARGS)
 4634 {
 4635         int *name = (int *)arg1 - 1;    /* XXX */
 4636         u_int namelen = arg2 + 1;       /* XXX */
 4637         struct vfsconf *vfsp;
 4638 
 4639         log(LOG_WARNING, "userland calling deprecated sysctl, "
 4640             "please rebuild world\n");
 4641 
 4642 #if 1 || defined(COMPAT_PRELITE2)
 4643         /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
 4644         if (namelen == 1)
 4645                 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
 4646 #endif
 4647 
 4648         switch (name[1]) {
 4649         case VFS_MAXTYPENUM:
 4650                 if (namelen != 2)
 4651                         return (ENOTDIR);
 4652                 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
 4653         case VFS_CONF:
 4654                 if (namelen != 3)
 4655                         return (ENOTDIR);       /* overloaded */
 4656                 vfsconf_slock();
 4657                 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 4658                         if (vfsp->vfc_typenum == name[2])
 4659                                 break;
 4660                 }
 4661                 vfsconf_sunlock();
 4662                 if (vfsp == NULL)
 4663                         return (EOPNOTSUPP);
 4664 #ifdef COMPAT_FREEBSD32
 4665                 if (req->flags & SCTL_MASK32)
 4666                         return (vfsconf2x32(req, vfsp));
 4667                 else
 4668 #endif
 4669                         return (vfsconf2x(req, vfsp));
 4670         }
 4671         return (EOPNOTSUPP);
 4672 }
 4673 
 4674 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
 4675     CTLFLAG_MPSAFE, vfs_sysctl,
 4676     "Generic filesystem");
 4677 
 4678 #if 1 || defined(COMPAT_PRELITE2)
 4679 
 4680 static int
 4681 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
 4682 {
 4683         int error;
 4684         struct vfsconf *vfsp;
 4685         struct ovfsconf ovfs;
 4686 
 4687         vfsconf_slock();
 4688         TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 4689                 bzero(&ovfs, sizeof(ovfs));
 4690                 ovfs.vfc_vfsops = vfsp->vfc_vfsops;     /* XXX used as flag */
 4691                 strcpy(ovfs.vfc_name, vfsp->vfc_name);
 4692                 ovfs.vfc_index = vfsp->vfc_typenum;
 4693                 ovfs.vfc_refcount = vfsp->vfc_refcount;
 4694                 ovfs.vfc_flags = vfsp->vfc_flags;
 4695                 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
 4696                 if (error != 0) {
 4697                         vfsconf_sunlock();
 4698                         return (error);
 4699                 }
 4700         }
 4701         vfsconf_sunlock();
 4702         return (0);
 4703 }
 4704 
 4705 #endif /* 1 || COMPAT_PRELITE2 */
 4706 #endif /* !BURN_BRIDGES */
 4707 
 4708 #define KINFO_VNODESLOP         10
 4709 #ifdef notyet
 4710 /*
 4711  * Dump vnode list (via sysctl).
 4712  */
 4713 /* ARGSUSED */
 4714 static int
 4715 sysctl_vnode(SYSCTL_HANDLER_ARGS)
 4716 {
 4717         struct xvnode *xvn;
 4718         struct mount *mp;
 4719         struct vnode *vp;
 4720         int error, len, n;
 4721 
 4722         /*
 4723          * Stale numvnodes access is not fatal here.
 4724          */
 4725         req->lock = 0;
 4726         len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
 4727         if (!req->oldptr)
 4728                 /* Make an estimate */
 4729                 return (SYSCTL_OUT(req, 0, len));
 4730 
 4731         error = sysctl_wire_old_buffer(req, 0);
 4732         if (error != 0)
 4733                 return (error);
 4734         xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
 4735         n = 0;
 4736         mtx_lock(&mountlist_mtx);
 4737         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 4738                 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
 4739                         continue;
 4740                 MNT_ILOCK(mp);
 4741                 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 4742                         if (n == len)
 4743                                 break;
 4744                         vref(vp);
 4745                         xvn[n].xv_size = sizeof *xvn;
 4746                         xvn[n].xv_vnode = vp;
 4747                         xvn[n].xv_id = 0;       /* XXX compat */
 4748 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
 4749                         XV_COPY(usecount);
 4750                         XV_COPY(writecount);
 4751                         XV_COPY(holdcnt);
 4752                         XV_COPY(mount);
 4753                         XV_COPY(numoutput);
 4754                         XV_COPY(type);
 4755 #undef XV_COPY
 4756                         xvn[n].xv_flag = vp->v_vflag;
 4757 
 4758                         switch (vp->v_type) {
 4759                         case VREG:
 4760                         case VDIR:
 4761                         case VLNK:
 4762                                 break;
 4763                         case VBLK:
 4764                         case VCHR:
 4765                                 if (vp->v_rdev == NULL) {
 4766                                         vrele(vp);
 4767                                         continue;
 4768                                 }
 4769                                 xvn[n].xv_dev = dev2udev(vp->v_rdev);
 4770                                 break;
 4771                         case VSOCK:
 4772                                 xvn[n].xv_socket = vp->v_socket;
 4773                                 break;
 4774                         case VFIFO:
 4775                                 xvn[n].xv_fifo = vp->v_fifoinfo;
 4776                                 break;
 4777                         case VNON:
 4778                         case VBAD:
 4779                         default:
 4780                                 /* shouldn't happen? */
 4781                                 vrele(vp);
 4782                                 continue;
 4783                         }
 4784                         vrele(vp);
 4785                         ++n;
 4786                 }
 4787                 MNT_IUNLOCK(mp);
 4788                 mtx_lock(&mountlist_mtx);
 4789                 vfs_unbusy(mp);
 4790                 if (n == len)
 4791                         break;
 4792         }
 4793         mtx_unlock(&mountlist_mtx);
 4794 
 4795         error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
 4796         free(xvn, M_TEMP);
 4797         return (error);
 4798 }
 4799 
 4800 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
 4801     CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
 4802     "");
 4803 #endif
 4804 
 4805 static void
 4806 unmount_or_warn(struct mount *mp)
 4807 {
 4808         int error;
 4809 
 4810         error = dounmount(mp, MNT_FORCE, curthread);
 4811         if (error != 0) {
 4812                 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
 4813                 if (error == EBUSY)
 4814                         printf("BUSY)\n");
 4815                 else
 4816                         printf("%d)\n", error);
 4817         }
 4818 }
 4819 
 4820 /*
 4821  * Unmount all filesystems. The list is traversed in reverse order
 4822  * of mounting to avoid dependencies.
 4823  */
 4824 void
 4825 vfs_unmountall(void)
 4826 {
 4827         struct mount *mp, *tmp;
 4828 
 4829         CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
 4830 
 4831         /*
 4832          * Since this only runs when rebooting, it is not interlocked.
 4833          */
 4834         TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
 4835                 vfs_ref(mp);
 4836 
 4837                 /*
 4838                  * Forcibly unmounting "/dev" before "/" would prevent clean
 4839                  * unmount of the latter.
 4840                  */
 4841                 if (mp == rootdevmp)
 4842                         continue;
 4843 
 4844                 unmount_or_warn(mp);
 4845         }
 4846 
 4847         if (rootdevmp != NULL)
 4848                 unmount_or_warn(rootdevmp);
 4849 }
 4850 
 4851 static void
 4852 vfs_deferred_inactive(struct vnode *vp, int lkflags)
 4853 {
 4854 
 4855         ASSERT_VI_LOCKED(vp, __func__);
 4856         VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set"));
 4857         if ((vp->v_iflag & VI_OWEINACT) == 0) {
 4858                 vdropl(vp);
 4859                 return;
 4860         }
 4861         if (vn_lock(vp, lkflags) == 0) {
 4862                 VI_LOCK(vp);
 4863                 vinactive(vp);
 4864                 VOP_UNLOCK(vp);
 4865                 vdropl(vp);
 4866                 return;
 4867         }
 4868         vdefer_inactive_unlocked(vp);
 4869 }
 4870 
 4871 static int
 4872 vfs_periodic_inactive_filter(struct vnode *vp, void *arg)
 4873 {
 4874 
 4875         return (vp->v_iflag & VI_DEFINACT);
 4876 }
 4877 
 4878 static void __noinline
 4879 vfs_periodic_inactive(struct mount *mp, int flags)
 4880 {
 4881         struct vnode *vp, *mvp;
 4882         int lkflags;
 4883 
 4884         lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
 4885         if (flags != MNT_WAIT)
 4886                 lkflags |= LK_NOWAIT;
 4887 
 4888         MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) {
 4889                 if ((vp->v_iflag & VI_DEFINACT) == 0) {
 4890                         VI_UNLOCK(vp);
 4891                         continue;
 4892                 }
 4893                 vp->v_iflag &= ~VI_DEFINACT;
 4894                 vfs_deferred_inactive(vp, lkflags);
 4895         }
 4896 }
 4897 
 4898 static inline bool
 4899 vfs_want_msync(struct vnode *vp)
 4900 {
 4901         struct vm_object *obj;
 4902 
 4903         /*
 4904          * This test may be performed without any locks held.
 4905          * We rely on vm_object's type stability.
 4906          */
 4907         if (vp->v_vflag & VV_NOSYNC)
 4908                 return (false);
 4909         obj = vp->v_object;
 4910         return (obj != NULL && vm_object_mightbedirty(obj));
 4911 }
 4912 
 4913 static int
 4914 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused)
 4915 {
 4916 
 4917         if (vp->v_vflag & VV_NOSYNC)
 4918                 return (false);
 4919         if (vp->v_iflag & VI_DEFINACT)
 4920                 return (true);
 4921         return (vfs_want_msync(vp));
 4922 }
 4923 
 4924 static void __noinline
 4925 vfs_periodic_msync_inactive(struct mount *mp, int flags)
 4926 {
 4927         struct vnode *vp, *mvp;
 4928         struct vm_object *obj;
 4929         int lkflags, objflags;
 4930         bool seen_defer;
 4931 
 4932         lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
 4933         if (flags != MNT_WAIT) {
 4934                 lkflags |= LK_NOWAIT;
 4935                 objflags = OBJPC_NOSYNC;
 4936         } else {
 4937                 objflags = OBJPC_SYNC;
 4938         }
 4939 
 4940         MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) {
 4941                 seen_defer = false;
 4942                 if (vp->v_iflag & VI_DEFINACT) {
 4943                         vp->v_iflag &= ~VI_DEFINACT;
 4944                         seen_defer = true;
 4945                 }
 4946                 if (!vfs_want_msync(vp)) {
 4947                         if (seen_defer)
 4948                                 vfs_deferred_inactive(vp, lkflags);
 4949                         else
 4950                                 VI_UNLOCK(vp);
 4951                         continue;
 4952                 }
 4953                 if (vget(vp, lkflags) == 0) {
 4954                         obj = vp->v_object;
 4955                         if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) {
 4956                                 VM_OBJECT_WLOCK(obj);
 4957                                 vm_object_page_clean(obj, 0, 0, objflags);
 4958                                 VM_OBJECT_WUNLOCK(obj);
 4959                         }
 4960                         vput(vp);
 4961                         if (seen_defer)
 4962                                 vdrop(vp);
 4963                 } else {
 4964                         if (seen_defer)
 4965                                 vdefer_inactive_unlocked(vp);
 4966                 }
 4967         }
 4968 }
 4969 
 4970 void
 4971 vfs_periodic(struct mount *mp, int flags)
 4972 {
 4973 
 4974         CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 4975 
 4976         if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
 4977                 vfs_periodic_inactive(mp, flags);
 4978         else
 4979                 vfs_periodic_msync_inactive(mp, flags);
 4980 }
 4981 
 4982 static void
 4983 destroy_vpollinfo_free(struct vpollinfo *vi)
 4984 {
 4985 
 4986         knlist_destroy(&vi->vpi_selinfo.si_note);
 4987         mtx_destroy(&vi->vpi_lock);
 4988         free(vi, M_VNODEPOLL);
 4989 }
 4990 
 4991 static void
 4992 destroy_vpollinfo(struct vpollinfo *vi)
 4993 {
 4994 
 4995         knlist_clear(&vi->vpi_selinfo.si_note, 1);
 4996         seldrain(&vi->vpi_selinfo);
 4997         destroy_vpollinfo_free(vi);
 4998 }
 4999 
 5000 /*
 5001  * Initialize per-vnode helper structure to hold poll-related state.
 5002  */
 5003 void
 5004 v_addpollinfo(struct vnode *vp)
 5005 {
 5006         struct vpollinfo *vi;
 5007 
 5008         if (vp->v_pollinfo != NULL)
 5009                 return;
 5010         vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO);
 5011         mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
 5012         knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
 5013             vfs_knlunlock, vfs_knl_assert_lock);
 5014         VI_LOCK(vp);
 5015         if (vp->v_pollinfo != NULL) {
 5016                 VI_UNLOCK(vp);
 5017                 destroy_vpollinfo_free(vi);
 5018                 return;
 5019         }
 5020         vp->v_pollinfo = vi;
 5021         VI_UNLOCK(vp);
 5022 }
 5023 
 5024 /*
 5025  * Record a process's interest in events which might happen to
 5026  * a vnode.  Because poll uses the historic select-style interface
 5027  * internally, this routine serves as both the ``check for any
 5028  * pending events'' and the ``record my interest in future events''
 5029  * functions.  (These are done together, while the lock is held,
 5030  * to avoid race conditions.)
 5031  */
 5032 int
 5033 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
 5034 {
 5035 
 5036         v_addpollinfo(vp);
 5037         mtx_lock(&vp->v_pollinfo->vpi_lock);
 5038         if (vp->v_pollinfo->vpi_revents & events) {
 5039                 /*
 5040                  * This leaves events we are not interested
 5041                  * in available for the other process which
 5042                  * presumably had requested them
 5043                  * (otherwise they would never have been
 5044                  * recorded).
 5045                  */
 5046                 events &= vp->v_pollinfo->vpi_revents;
 5047                 vp->v_pollinfo->vpi_revents &= ~events;
 5048 
 5049                 mtx_unlock(&vp->v_pollinfo->vpi_lock);
 5050                 return (events);
 5051         }
 5052         vp->v_pollinfo->vpi_events |= events;
 5053         selrecord(td, &vp->v_pollinfo->vpi_selinfo);
 5054         mtx_unlock(&vp->v_pollinfo->vpi_lock);
 5055         return (0);
 5056 }
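/*
 * A minimal kernel-side sketch (not part of the source) of a hypothetical
 * VOP_POLL method using vn_pollrecord(): events that can be answered
 * synchronously are returned right away, while interest in the remainder
 * is recorded so a later selwakeup() on the vnode's selinfo can deliver
 * them.  "example_poll" does not exist in the tree.
 */
static int
example_poll(struct vop_poll_args *ap)
{

        if (ap->a_events & ~POLLSTANDARD)
                return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
        return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}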
 5057 
 5058 /*
 5059  * Routine to create and manage a filesystem syncer vnode.
 5060  */
 5061 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
 5062 static int      sync_fsync(struct  vop_fsync_args *);
 5063 static int      sync_inactive(struct  vop_inactive_args *);
 5064 static int      sync_reclaim(struct  vop_reclaim_args *);
 5065 
 5066 static struct vop_vector sync_vnodeops = {
 5067         .vop_bypass =   VOP_EOPNOTSUPP,
 5068         .vop_close =    sync_close,             /* close */
 5069         .vop_fsync =    sync_fsync,             /* fsync */
 5070         .vop_inactive = sync_inactive,  /* inactive */
 5071         .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */
 5072         .vop_reclaim =  sync_reclaim,   /* reclaim */
 5073         .vop_lock1 =    vop_stdlock,    /* lock */
 5074         .vop_unlock =   vop_stdunlock,  /* unlock */
 5075         .vop_islocked = vop_stdislocked,        /* islocked */
 5076 };
 5077 VFS_VOP_VECTOR_REGISTER(sync_vnodeops);
 5078 
 5079 /*
 5080  * Create a new filesystem syncer vnode for the specified mount point.
 5081  */
 5082 void
 5083 vfs_allocate_syncvnode(struct mount *mp)
 5084 {
 5085         struct vnode *vp;
 5086         struct bufobj *bo;
 5087         static long start, incr, next;
 5088         int error;
 5089 
 5090         /* Allocate a new vnode */
 5091         error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
 5092         if (error != 0)
 5093                 panic("vfs_allocate_syncvnode: getnewvnode() failed");
 5094         vp->v_type = VNON;
 5095         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 5096         vp->v_vflag |= VV_FORCEINSMQ;
 5097         error = insmntque1(vp, mp);
 5098         if (error != 0)
 5099                 panic("vfs_allocate_syncvnode: insmntque() failed");
 5100         vp->v_vflag &= ~VV_FORCEINSMQ;
 5101         vn_set_state(vp, VSTATE_CONSTRUCTED);
 5102         VOP_UNLOCK(vp);
 5103         /*
 5104          * Place the vnode onto the syncer worklist. We attempt to
 5105          * scatter them about on the list so that they will go off
 5106          * at evenly distributed times even if all the filesystems
 5107          * are mounted at once.
 5108          */
 5109         next += incr;
 5110         if (next == 0 || next > syncer_maxdelay) {
 5111                 start /= 2;
 5112                 incr /= 2;
 5113                 if (start == 0) {
 5114                         start = syncer_maxdelay / 2;
 5115                         incr = syncer_maxdelay;
 5116                 }
 5117                 next = start;
 5118         }
 5119         bo = &vp->v_bufobj;
 5120         BO_LOCK(bo);
 5121         vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
 5122         /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
 5123         mtx_lock(&sync_mtx);
 5124         sync_vnode_count++;
 5125         if (mp->mnt_syncer == NULL) {
 5126                 mp->mnt_syncer = vp;
 5127                 vp = NULL;
 5128         }
 5129         mtx_unlock(&sync_mtx);
 5130         BO_UNLOCK(bo);
 5131         if (vp != NULL) {
 5132                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 5133                 vgone(vp);
 5134                 vput(vp);
 5135         }
 5136 }
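/*
 * Worked illustration (not part of the source) of the scattering above,
 * assuming the usual defaults of syncer_maxdelay == 32 and syncdelay == 30:
 * the first call finds next == 0 and seeds start = 16, incr = 32, next = 16;
 * the second overflows (16 + 32 > 32) and halves to start = 8, incr = 16,
 * next = 8; subsequent calls yield 24, then 4, 12, 20, 28, then 2, and so
 * on.  Successive syncer vnodes therefore land on worklist slots 16, 8, 24,
 * 4, 12, 20, 28, 2, ... (modulo syncdelay), a binary subdivision of the
 * delay range rather than all firing in the same second.
 */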
 5137 
 5138 void
 5139 vfs_deallocate_syncvnode(struct mount *mp)
 5140 {
 5141         struct vnode *vp;
 5142 
 5143         mtx_lock(&sync_mtx);
 5144         vp = mp->mnt_syncer;
 5145         if (vp != NULL)
 5146                 mp->mnt_syncer = NULL;
 5147         mtx_unlock(&sync_mtx);
 5148         if (vp != NULL)
 5149                 vrele(vp);
 5150 }
 5151 
 5152 /*
 5153  * Do a lazy sync of the filesystem.
 5154  */
 5155 static int
 5156 sync_fsync(struct vop_fsync_args *ap)
 5157 {
 5158         struct vnode *syncvp = ap->a_vp;
 5159         struct mount *mp = syncvp->v_mount;
 5160         int error, save;
 5161         struct bufobj *bo;
 5162 
 5163         /*
 5164          * We only need to do something if this is a lazy evaluation.
 5165          */
 5166         if (ap->a_waitfor != MNT_LAZY)
 5167                 return (0);
 5168 
 5169         /*
 5170          * Move ourselves to the back of the sync list.
 5171          */
 5172         bo = &syncvp->v_bufobj;
 5173         BO_LOCK(bo);
 5174         vn_syncer_add_to_worklist(bo, syncdelay);
 5175         BO_UNLOCK(bo);
 5176 
 5177         /*
 5178          * Walk the list of vnodes pushing all that are dirty and
 5179          * not already on the sync list.
 5180          */
 5181         if (vfs_busy(mp, MBF_NOWAIT) != 0)
 5182                 return (0);
 5183         VOP_UNLOCK(syncvp);
 5184         save = curthread_pflags_set(TDP_SYNCIO);
 5185         /*
 5186          * The filesystem at hand may be idle with free vnodes stored in the
 5187          * batch.  Return them instead of letting them stay there indefinitely.
 5188          */
 5189         vfs_periodic(mp, MNT_NOWAIT);
 5190         error = VFS_SYNC(mp, MNT_LAZY);
 5191         curthread_pflags_restore(save);
 5192         vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY);
 5193         vfs_unbusy(mp);
 5194         return (error);
 5195 }
 5196 
 5197 /*
 5198  * The syncer vnode is no longer referenced.
 5199  */
 5200 static int
 5201 sync_inactive(struct vop_inactive_args *ap)
 5202 {
 5203 
 5204         vgone(ap->a_vp);
 5205         return (0);
 5206 }
 5207 
 5208 /*
 5209  * The syncer vnode is no longer needed and is being decommissioned.
 5210  *
 5211  * Modifications to the worklist must be protected by sync_mtx.
 5212  */
 5213 static int
 5214 sync_reclaim(struct vop_reclaim_args *ap)
 5215 {
 5216         struct vnode *vp = ap->a_vp;
 5217         struct bufobj *bo;
 5218 
 5219         bo = &vp->v_bufobj;
 5220         BO_LOCK(bo);
 5221         mtx_lock(&sync_mtx);
 5222         if (vp->v_mount->mnt_syncer == vp)
 5223                 vp->v_mount->mnt_syncer = NULL;
 5224         if (bo->bo_flag & BO_ONWORKLST) {
 5225                 LIST_REMOVE(bo, bo_synclist);
 5226                 syncer_worklist_len--;
 5227                 sync_vnode_count--;
 5228                 bo->bo_flag &= ~BO_ONWORKLST;
 5229         }
 5230         mtx_unlock(&sync_mtx);
 5231         BO_UNLOCK(bo);
 5232 
 5233         return (0);
 5234 }
 5235 
 5236 int
 5237 vn_need_pageq_flush(struct vnode *vp)
 5238 {
 5239         struct vm_object *obj;
 5240 
 5241         obj = vp->v_object;
 5242         return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
 5243             vm_object_mightbedirty(obj));
 5244 }
 5245 
 5246 /*
 5247  * Check if vnode represents a disk device
 5248  */
 5249 bool
 5250 vn_isdisk_error(struct vnode *vp, int *errp)
 5251 {
 5252         int error;
 5253 
 5254         if (vp->v_type != VCHR) {
 5255                 error = ENOTBLK;
 5256                 goto out;
 5257         }
 5258         error = 0;
 5259         dev_lock();
 5260         if (vp->v_rdev == NULL)
 5261                 error = ENXIO;
 5262         else if (vp->v_rdev->si_devsw == NULL)
 5263                 error = ENXIO;
 5264         else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
 5265                 error = ENOTBLK;
 5266         dev_unlock();
 5267 out:
 5268         *errp = error;
 5269         return (error == 0);
 5270 }
 5271 
 5272 bool
 5273 vn_isdisk(struct vnode *vp)
 5274 {
 5275         int error;
 5276 
 5277         return (vn_isdisk_error(vp, &error));
 5278 }
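/*
 * Hedged usage sketch (not part of the source): a caller that needs the
 * precise errno uses vn_isdisk_error(), otherwise vn_isdisk() suffices.
 * "example_check_disk" is hypothetical.
 */
static int
example_check_disk(struct vnode *vp)
{
        int error;

        if (!vn_isdisk_error(vp, &error))
                return (error);         /* ENOTBLK or ENXIO, as set above */
        /* vp is a VCHR vnode whose cdevsw advertises D_DISK. */
        return (0);
}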
 5279 
 5280 /*
 5281  * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
 5282  * the comment above cache_fplookup for details.
 5283  */
 5284 int
 5285 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
 5286 {
 5287         int error;
 5288 
 5289         VFS_SMR_ASSERT_ENTERED();
 5290 
 5291         /* Check the owner. */
 5292         if (cred->cr_uid == file_uid) {
 5293                 if (file_mode & S_IXUSR)
 5294                         return (0);
 5295                 goto out_error;
 5296         }
 5297 
 5298         /* Otherwise, check the groups (first match) */
 5299         if (groupmember(file_gid, cred)) {
 5300                 if (file_mode & S_IXGRP)
 5301                         return (0);
 5302                 goto out_error;
 5303         }
 5304 
 5305         /* Otherwise, check everyone else. */
 5306         if (file_mode & S_IXOTH)
 5307                 return (0);
 5308 out_error:
 5309         /*
 5310          * Permission check failed, but it is possible the denial will get overridden
 5311          * (e.g., when root is traversing through a 700 directory owned by someone
 5312          * else).
 5313          *
 5314          * vaccess() calls priv_check_cred which in turn can descend into MAC
 5315          * modules overriding this result. It is unclear what semantics they
 5316          * are allowed to operate with, thus for safety we don't call them
 5317          * from within the SMR section. This also means if any such modules
 5318          * are present, we have to let the regular lookup decide.
 5319          */
 5320         error = priv_check_cred_vfs_lookup_nomac(cred);
 5321         switch (error) {
 5322         case 0:
 5323                 return (0);
 5324         case EAGAIN:
 5325                 /*
 5326                  * MAC modules present.
 5327                  */
 5328                 return (EAGAIN);
 5329         case EPERM:
 5330                 return (EACCES);
 5331         default:
 5332                 return (error);
 5333         }
 5334 }
 5335 
 5336 /*
 5337  * Common filesystem object access control check routine.  Accepts a
 5338  * vnode's type, "mode", uid and gid, requested access mode, and credentials.
 5339  * Returns 0 on success, or an errno on failure.
 5340  */
 5341 int
 5342 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
 5343     accmode_t accmode, struct ucred *cred)
 5344 {
 5345         accmode_t dac_granted;
 5346         accmode_t priv_granted;
 5347 
 5348         KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
 5349             ("invalid bit in accmode"));
 5350         KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
 5351             ("VAPPEND without VWRITE"));
 5352 
 5353         /*
 5354          * Look for a normal, non-privileged way to access the file/directory
 5355          * as requested.  If it exists, go with that.
 5356          */
 5357 
 5358         dac_granted = 0;
 5359 
 5360         /* Check the owner. */
 5361         if (cred->cr_uid == file_uid) {
 5362                 dac_granted |= VADMIN;
 5363                 if (file_mode & S_IXUSR)
 5364                         dac_granted |= VEXEC;
 5365                 if (file_mode & S_IRUSR)
 5366                         dac_granted |= VREAD;
 5367                 if (file_mode & S_IWUSR)
 5368                         dac_granted |= (VWRITE | VAPPEND);
 5369 
 5370                 if ((accmode & dac_granted) == accmode)
 5371                         return (0);
 5372 
 5373                 goto privcheck;
 5374         }
 5375 
 5376         /* Otherwise, check the groups (first match) */
 5377         if (groupmember(file_gid, cred)) {
 5378                 if (file_mode & S_IXGRP)
 5379                         dac_granted |= VEXEC;
 5380                 if (file_mode & S_IRGRP)
 5381                         dac_granted |= VREAD;
 5382                 if (file_mode & S_IWGRP)
 5383                         dac_granted |= (VWRITE | VAPPEND);
 5384 
 5385                 if ((accmode & dac_granted) == accmode)
 5386                         return (0);
 5387 
 5388                 goto privcheck;
 5389         }
 5390 
 5391         /* Otherwise, check everyone else. */
 5392         if (file_mode & S_IXOTH)
 5393                 dac_granted |= VEXEC;
 5394         if (file_mode & S_IROTH)
 5395                 dac_granted |= VREAD;
 5396         if (file_mode & S_IWOTH)
 5397                 dac_granted |= (VWRITE | VAPPEND);
 5398         if ((accmode & dac_granted) == accmode)
 5399                 return (0);
 5400 
 5401 privcheck:
 5402         /*
 5403          * Build a privilege mask to determine if the set of privileges
 5404          * satisfies the requirements when combined with the granted mask
 5405          * from above.  For each privilege, if the privilege is required,
 5406          * bitwise or the request type onto the priv_granted mask.
 5407          */
 5408         priv_granted = 0;
 5409 
 5410         if (type == VDIR) {
 5411                 /*
 5412                  * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
 5413                  * requests, instead of PRIV_VFS_EXEC.
 5414                  */
 5415                 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 5416                     !priv_check_cred(cred, PRIV_VFS_LOOKUP))
 5417                         priv_granted |= VEXEC;
 5418         } else {
 5419                 /*
 5420                  * Ensure that at least one execute bit is on. Otherwise,
 5421                  * a privileged user will always succeed, and we don't want
 5422                  * this to happen unless the file really is executable.
 5423                  */
 5424                 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 5425                     (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
 5426                     !priv_check_cred(cred, PRIV_VFS_EXEC))
 5427                         priv_granted |= VEXEC;
 5428         }
 5429 
 5430         if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
 5431             !priv_check_cred(cred, PRIV_VFS_READ))
 5432                 priv_granted |= VREAD;
 5433 
 5434         if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
 5435             !priv_check_cred(cred, PRIV_VFS_WRITE))
 5436                 priv_granted |= (VWRITE | VAPPEND);
 5437 
 5438         if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
 5439             !priv_check_cred(cred, PRIV_VFS_ADMIN))
 5440                 priv_granted |= VADMIN;
 5441 
 5442         if ((accmode & (priv_granted | dac_granted)) == accmode) {
 5443                 return (0);
 5444         }
 5445 
 5446         return ((accmode & VADMIN) ? EPERM : EACCES);
 5447 }
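/*
 * Worked illustration (not part of the source): for a regular file with
 * mode 0640 owned by uid 1001 / gid 20, a credential with cr_uid 1002
 * that is a member of gid 20 requesting VREAD | VWRITE takes the group
 * branch and collects dac_granted = VREAD only (0640 gives the group no
 * write bit).  Since the request is not fully covered, the privcheck
 * path runs: if the credential holds PRIV_VFS_WRITE, the missing
 * VWRITE | VAPPEND is supplied via priv_granted and the check succeeds;
 * otherwise the result is EACCES (EPERM is reserved for failed VADMIN
 * requests).
 */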
 5448 
 5449 /*
 5450  * Credential check based on process requesting service, and per-attribute
 5451  * permissions.
 5452  */
 5453 int
 5454 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
 5455     struct thread *td, accmode_t accmode)
 5456 {
 5457 
 5458         /*
 5459          * Kernel-invoked always succeeds.
 5460          */
 5461         if (cred == NOCRED)
 5462                 return (0);
 5463 
 5464         /*
 5465          * Do not allow privileged processes in jail to directly manipulate
 5466          * system attributes.
 5467          */
 5468         switch (attrnamespace) {
 5469         case EXTATTR_NAMESPACE_SYSTEM:
 5470                 /* Potentially should be: return (EPERM); */
 5471                 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
 5472         case EXTATTR_NAMESPACE_USER:
 5473                 return (VOP_ACCESS(vp, accmode, cred, td));
 5474         default:
 5475                 return (EPERM);
 5476         }
 5477 }
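/*
 * Hedged sketch (not part of the source) of a filesystem delegating its
 * extended-attribute permission check here; the function name is
 * hypothetical, and a set or delete operation would pass VWRITE instead.
 */
static int
example_getextattr_access(struct vnode *vp, int attrnamespace,
    struct ucred *cred, struct thread *td)
{

        return (extattr_check_cred(vp, attrnamespace, cred, td, VREAD));
}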
 5478 
 5479 #ifdef DEBUG_VFS_LOCKS
 5480 int vfs_badlock_ddb = 1;        /* Drop into debugger on violation. */
 5481 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
 5482     "Drop into debugger on lock violation");
 5483 
 5484 int vfs_badlock_mutex = 1;      /* Check for interlock across VOPs. */
 5485 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
 5486     0, "Check for interlock across VOPs");
 5487 
 5488 int vfs_badlock_print = 1;      /* Print lock violations. */
 5489 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
 5490     0, "Print lock violations");
 5491 
 5492 int vfs_badlock_vnode = 1;      /* Print vnode details on lock violations. */
 5493 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
 5494     0, "Print vnode details on lock violations");
 5495 
 5496 #ifdef KDB
 5497 int vfs_badlock_backtrace = 1;  /* Print backtrace at lock violations. */
 5498 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
 5499     &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
 5500 #endif
 5501 
 5502 static void
 5503 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
 5504 {
 5505 
 5506 #ifdef KDB
 5507         if (vfs_badlock_backtrace)
 5508                 kdb_backtrace();
 5509 #endif
 5510         if (vfs_badlock_vnode)
 5511                 vn_printf(vp, "vnode ");
 5512         if (vfs_badlock_print)
 5513                 printf("%s: %p %s\n", str, (void *)vp, msg);
 5514         if (vfs_badlock_ddb)
 5515                 kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
 5516 }
 5517 
 5518 void
 5519 assert_vi_locked(struct vnode *vp, const char *str)
 5520 {
 5521 
 5522         if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
 5523                 vfs_badlock("interlock is not locked but should be", str, vp);
 5524 }
 5525 
 5526 void
 5527 assert_vi_unlocked(struct vnode *vp, const char *str)
 5528 {
 5529 
 5530         if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
 5531                 vfs_badlock("interlock is locked but should not be", str, vp);
 5532 }
 5533 
 5534 void
 5535 assert_vop_locked(struct vnode *vp, const char *str)
 5536 {
 5537         int locked;
 5538 
 5539         if (KERNEL_PANICKED() || vp == NULL)
 5540                 return;
 5541 
 5542         locked = VOP_ISLOCKED(vp);
 5543         if (locked == 0 || locked == LK_EXCLOTHER)
 5544                 vfs_badlock("is not locked but should be", str, vp);
 5545 }
 5546 
 5547 void
 5548 assert_vop_unlocked(struct vnode *vp, const char *str)
 5549 {
 5550         if (KERNEL_PANICKED() || vp == NULL)
 5551                 return;
 5552 
 5553         if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
 5554                 vfs_badlock("is locked but should not be", str, vp);
 5555 }
 5556 
 5557 void
 5558 assert_vop_elocked(struct vnode *vp, const char *str)
 5559 {
 5560         if (KERNEL_PANICKED() || vp == NULL)
 5561                 return;
 5562 
 5563         if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 5564                 vfs_badlock("is not exclusive locked but should be", str, vp);
 5565 }
 5566 #endif /* DEBUG_VFS_LOCKS */
 5567 
 5568 void
 5569 vop_rename_fail(struct vop_rename_args *ap)
 5570 {
 5571 
 5572         if (ap->a_tvp != NULL)
 5573                 vput(ap->a_tvp);
 5574         if (ap->a_tdvp == ap->a_tvp)
 5575                 vrele(ap->a_tdvp);
 5576         else
 5577                 vput(ap->a_tdvp);
 5578         vrele(ap->a_fdvp);
 5579         vrele(ap->a_fvp);
 5580 }
 5581 
 5582 void
 5583 vop_rename_pre(void *ap)
 5584 {
 5585         struct vop_rename_args *a = ap;
 5586 
 5587 #ifdef DEBUG_VFS_LOCKS
 5588         if (a->a_tvp)
 5589                 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
 5590         ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
 5591         ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
 5592         ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
 5593 
 5594         /* Check the source (from). */
 5595         if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
 5596             (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
 5597                 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
 5598         if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
 5599                 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
 5600 
 5601         /* Check the target. */
 5602         if (a->a_tvp)
 5603                 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
 5604         ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
 5605 #endif
 5606         /*
 5607          * It may be tempting to add vn_seqc_write_begin/end calls here and
 5608          * in vop_rename_post but that's not going to work out since some
 5609          * filesystems relookup vnodes mid-rename. This is probably a bug.
 5610          *
 5611          * For now filesystems are expected to do the relevant calls after they
 5612          * decide what vnodes to operate on.
 5613          */
 5614         if (a->a_tdvp != a->a_fdvp)
 5615                 vhold(a->a_fdvp);
 5616         if (a->a_tvp != a->a_fvp)
 5617                 vhold(a->a_fvp);
 5618         vhold(a->a_tdvp);
 5619         if (a->a_tvp)
 5620                 vhold(a->a_tvp);
 5621 }
 5622 
 5623 #ifdef DEBUG_VFS_LOCKS
 5624 void
 5625 vop_fplookup_vexec_debugpre(void *ap __unused)
 5626 {
 5627 
 5628         VFS_SMR_ASSERT_ENTERED();
 5629 }
 5630 
 5631 void
 5632 vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused)
 5633 {
 5634 
 5635         VFS_SMR_ASSERT_ENTERED();
 5636 }
 5637 
 5638 void
 5639 vop_fplookup_symlink_debugpre(void *ap __unused)
 5640 {
 5641 
 5642         VFS_SMR_ASSERT_ENTERED();
 5643 }
 5644 
 5645 void
 5646 vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused)
 5647 {
 5648 
 5649         VFS_SMR_ASSERT_ENTERED();
 5650 }
 5651 
 5652 static void
 5653 vop_fsync_debugprepost(struct vnode *vp, const char *name)
 5654 {
 5655         if (vp->v_type == VCHR)
 5656                 ;
 5657         else if (MNT_EXTENDED_SHARED(vp->v_mount))
 5658                 ASSERT_VOP_LOCKED(vp, name);
 5659         else
 5660                 ASSERT_VOP_ELOCKED(vp, name);
 5661 }
 5662 
 5663 void
 5664 vop_fsync_debugpre(void *a)
 5665 {
 5666         struct vop_fsync_args *ap;
 5667 
 5668         ap = a;
 5669         vop_fsync_debugprepost(ap->a_vp, "fsync");
 5670 }
 5671 
 5672 void
 5673 vop_fsync_debugpost(void *a, int rc __unused)
 5674 {
 5675         struct vop_fsync_args *ap;
 5676 
 5677         ap = a;
 5678         vop_fsync_debugprepost(ap->a_vp, "fsync");
 5679 }
 5680 
 5681 void
 5682 vop_fdatasync_debugpre(void *a)
 5683 {
 5684         struct vop_fdatasync_args *ap;
 5685 
 5686         ap = a;
 5687         vop_fsync_debugprepost(ap->a_vp, "fsync");
 5688 }
 5689 
 5690 void
 5691 vop_fdatasync_debugpost(void *a, int rc __unused)
 5692 {
 5693         struct vop_fdatasync_args *ap;
 5694 
 5695         ap = a;
 5696         vop_fsync_debugprepost(ap->a_vp, "fsync");
 5697 }
 5698 
 5699 void
 5700 vop_strategy_debugpre(void *ap)
 5701 {
 5702         struct vop_strategy_args *a;
 5703         struct buf *bp;
 5704 
 5705         a = ap;
 5706         bp = a->a_bp;
 5707 
 5708         /*
 5709          * Cluster ops lock their component buffers but not the IO container.
 5710          */
 5711         if ((bp->b_flags & B_CLUSTER) != 0)
 5712                 return;
 5713 
 5714         if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) {
 5715                 if (vfs_badlock_print)
 5716                         printf(
 5717                             "VOP_STRATEGY: bp is not locked but should be\n");
 5718                 if (vfs_badlock_ddb)
 5719                         kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
 5720         }
 5721 }
 5722 
 5723 void
 5724 vop_lock_debugpre(void *ap)
 5725 {
 5726         struct vop_lock1_args *a = ap;
 5727 
 5728         if ((a->a_flags & LK_INTERLOCK) == 0)
 5729                 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 5730         else
 5731                 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
 5732 }
 5733 
 5734 void
 5735 vop_lock_debugpost(void *ap, int rc)
 5736 {
 5737         struct vop_lock1_args *a = ap;
 5738 
 5739         ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 5740         if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
 5741                 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
 5742 }
 5743 
 5744 void
 5745 vop_unlock_debugpre(void *ap)
 5746 {
 5747         struct vop_unlock_args *a = ap;
 5748         struct vnode *vp = a->a_vp;
 5749 
 5750         VNPASS(vn_get_state(vp) != VSTATE_UNINITIALIZED, vp);
 5751         ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK");
 5752 }
 5753 
 5754 void
 5755 vop_need_inactive_debugpre(void *ap)
 5756 {
 5757         struct vop_need_inactive_args *a = ap;
 5758 
 5759         ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
 5760 }
 5761 
 5762 void
 5763 vop_need_inactive_debugpost(void *ap, int rc)
 5764 {
 5765         struct vop_need_inactive_args *a = ap;
 5766 
 5767         ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
 5768 }
 5769 #endif
 5770 
 5771 void
 5772 vop_create_pre(void *ap)
 5773 {
 5774         struct vop_create_args *a;
 5775         struct vnode *dvp;
 5776 
 5777         a = ap;
 5778         dvp = a->a_dvp;
 5779         vn_seqc_write_begin(dvp);
 5780 }
 5781 
 5782 void
 5783 vop_create_post(void *ap, int rc)
 5784 {
 5785         struct vop_create_args *a;
 5786         struct vnode *dvp;
 5787 
 5788         a = ap;
 5789         dvp = a->a_dvp;
 5790         vn_seqc_write_end(dvp);
 5791         if (!rc)
 5792                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
 5793 }
 5794 
 5795 void
 5796 vop_whiteout_pre(void *ap)
 5797 {
 5798         struct vop_whiteout_args *a;
 5799         struct vnode *dvp;
 5800 
 5801         a = ap;
 5802         dvp = a->a_dvp;
 5803         vn_seqc_write_begin(dvp);
 5804 }
 5805 
 5806 void
 5807 vop_whiteout_post(void *ap, int rc)
 5808 {
 5809         struct vop_whiteout_args *a;
 5810         struct vnode *dvp;
 5811 
 5812         a = ap;
 5813         dvp = a->a_dvp;
 5814         vn_seqc_write_end(dvp);
 5815 }
 5816 
 5817 void
 5818 vop_deleteextattr_pre(void *ap)
 5819 {
 5820         struct vop_deleteextattr_args *a;
 5821         struct vnode *vp;
 5822 
 5823         a = ap;
 5824         vp = a->a_vp;
 5825         vn_seqc_write_begin(vp);
 5826 }
 5827 
 5828 void
 5829 vop_deleteextattr_post(void *ap, int rc)
 5830 {
 5831         struct vop_deleteextattr_args *a;
 5832         struct vnode *vp;
 5833 
 5834         a = ap;
 5835         vp = a->a_vp;
 5836         vn_seqc_write_end(vp);
 5837         if (!rc)
 5838                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
 5839 }
 5840 
 5841 void
 5842 vop_link_pre(void *ap)
 5843 {
 5844         struct vop_link_args *a;
 5845         struct vnode *vp, *tdvp;
 5846 
 5847         a = ap;
 5848         vp = a->a_vp;
 5849         tdvp = a->a_tdvp;
 5850         vn_seqc_write_begin(vp);
 5851         vn_seqc_write_begin(tdvp);
 5852 }
 5853 
 5854 void
 5855 vop_link_post(void *ap, int rc)
 5856 {
 5857         struct vop_link_args *a;
 5858         struct vnode *vp, *tdvp;
 5859 
 5860         a = ap;
 5861         vp = a->a_vp;
 5862         tdvp = a->a_tdvp;
 5863         vn_seqc_write_end(vp);
 5864         vn_seqc_write_end(tdvp);
 5865         if (!rc) {
 5866                 VFS_KNOTE_LOCKED(vp, NOTE_LINK);
 5867                 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE);
 5868         }
 5869 }
 5870 
 5871 void
 5872 vop_mkdir_pre(void *ap)
 5873 {
 5874         struct vop_mkdir_args *a;
 5875         struct vnode *dvp;
 5876 
 5877         a = ap;
 5878         dvp = a->a_dvp;
 5879         vn_seqc_write_begin(dvp);
 5880 }
 5881 
 5882 void
 5883 vop_mkdir_post(void *ap, int rc)
 5884 {
 5885         struct vop_mkdir_args *a;
 5886         struct vnode *dvp;
 5887 
 5888         a = ap;
 5889         dvp = a->a_dvp;
 5890         vn_seqc_write_end(dvp);
 5891         if (!rc)
 5892                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
 5893 }
 5894 
 5895 #ifdef DEBUG_VFS_LOCKS
 5896 void
 5897 vop_mkdir_debugpost(void *ap, int rc)
 5898 {
 5899         struct vop_mkdir_args *a;
 5900 
 5901         a = ap;
 5902         if (!rc)
 5903                 cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp);
 5904 }
 5905 #endif
 5906 
 5907 void
 5908 vop_mknod_pre(void *ap)
 5909 {
 5910         struct vop_mknod_args *a;
 5911         struct vnode *dvp;
 5912 
 5913         a = ap;
 5914         dvp = a->a_dvp;
 5915         vn_seqc_write_begin(dvp);
 5916 }
 5917 
 5918 void
 5919 vop_mknod_post(void *ap, int rc)
 5920 {
 5921         struct vop_mknod_args *a;
 5922         struct vnode *dvp;
 5923 
 5924         a = ap;
 5925         dvp = a->a_dvp;
 5926         vn_seqc_write_end(dvp);
 5927         if (!rc)
 5928                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
 5929 }
 5930 
 5931 void
 5932 vop_reclaim_post(void *ap, int rc)
 5933 {
 5934         struct vop_reclaim_args *a;
 5935         struct vnode *vp;
 5936 
 5937         a = ap;
 5938         vp = a->a_vp;
 5939         ASSERT_VOP_IN_SEQC(vp);
 5940         if (!rc)
 5941                 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE);
 5942 }
 5943 
 5944 void
 5945 vop_remove_pre(void *ap)
 5946 {
 5947         struct vop_remove_args *a;
 5948         struct vnode *dvp, *vp;
 5949 
 5950         a = ap;
 5951         dvp = a->a_dvp;
 5952         vp = a->a_vp;
 5953         vn_seqc_write_begin(dvp);
 5954         vn_seqc_write_begin(vp);
 5955 }
 5956 
 5957 void
 5958 vop_remove_post(void *ap, int rc)
 5959 {
 5960         struct vop_remove_args *a;
 5961         struct vnode *dvp, *vp;
 5962 
 5963         a = ap;
 5964         dvp = a->a_dvp;
 5965         vp = a->a_vp;
 5966         vn_seqc_write_end(dvp);
 5967         vn_seqc_write_end(vp);
 5968         if (!rc) {
 5969                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
 5970                 VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
 5971         }
 5972 }
 5973 
 5974 void
 5975 vop_rename_post(void *ap, int rc)
 5976 {
 5977         struct vop_rename_args *a = ap;
 5978         long hint;
 5979 
 5980         if (!rc) {
 5981                 hint = NOTE_WRITE;
 5982                 if (a->a_fdvp == a->a_tdvp) {
 5983                         if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
 5984                                 hint |= NOTE_LINK;
 5985                         VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
 5986                         VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
 5987                 } else {
 5988                         hint |= NOTE_EXTEND;
 5989                         if (a->a_fvp->v_type == VDIR)
 5990                                 hint |= NOTE_LINK;
 5991                         VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
 5992 
 5993                         if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
 5994                             a->a_tvp->v_type == VDIR)
 5995                                 hint &= ~NOTE_LINK;
 5996                         VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
 5997                 }
 5998 
 5999                 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
 6000                 if (a->a_tvp)
 6001                         VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
 6002         }
 6003         if (a->a_tdvp != a->a_fdvp)
 6004                 vdrop(a->a_fdvp);
 6005         if (a->a_tvp != a->a_fvp)
 6006                 vdrop(a->a_fvp);
 6007         vdrop(a->a_tdvp);
 6008         if (a->a_tvp)
 6009                 vdrop(a->a_tvp);
 6010 }
 6011 
 6012 void
 6013 vop_rmdir_pre(void *ap)
 6014 {
 6015         struct vop_rmdir_args *a;
 6016         struct vnode *dvp, *vp;
 6017 
 6018         a = ap;
 6019         dvp = a->a_dvp;
 6020         vp = a->a_vp;
 6021         vn_seqc_write_begin(dvp);
 6022         vn_seqc_write_begin(vp);
 6023 }
 6024 
 6025 void
 6026 vop_rmdir_post(void *ap, int rc)
 6027 {
 6028         struct vop_rmdir_args *a;
 6029         struct vnode *dvp, *vp;
 6030 
 6031         a = ap;
 6032         dvp = a->a_dvp;
 6033         vp = a->a_vp;
 6034         vn_seqc_write_end(dvp);
 6035         vn_seqc_write_end(vp);
 6036         if (!rc) {
 6037                 vp->v_vflag |= VV_UNLINKED;
 6038                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
 6039                 VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
 6040         }
 6041 }
 6042 
 6043 void
 6044 vop_setattr_pre(void *ap)
 6045 {
 6046         struct vop_setattr_args *a;
 6047         struct vnode *vp;
 6048 
 6049         a = ap;
 6050         vp = a->a_vp;
 6051         vn_seqc_write_begin(vp);
 6052 }
 6053 
 6054 void
 6055 vop_setattr_post(void *ap, int rc)
 6056 {
 6057         struct vop_setattr_args *a;
 6058         struct vnode *vp;
 6059 
 6060         a = ap;
 6061         vp = a->a_vp;
 6062         vn_seqc_write_end(vp);
 6063         if (!rc)
 6064                 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
 6065 }
 6066 
 6067 void
 6068 vop_setacl_pre(void *ap)
 6069 {
 6070         struct vop_setacl_args *a;
 6071         struct vnode *vp;
 6072 
 6073         a = ap;
 6074         vp = a->a_vp;
 6075         vn_seqc_write_begin(vp);
 6076 }
 6077 
 6078 void
 6079 vop_setacl_post(void *ap, int rc __unused)
 6080 {
 6081         struct vop_setacl_args *a;
 6082         struct vnode *vp;
 6083 
 6084         a = ap;
 6085         vp = a->a_vp;
 6086         vn_seqc_write_end(vp);
 6087 }
 6088 
 6089 void
 6090 vop_setextattr_pre(void *ap)
 6091 {
 6092         struct vop_setextattr_args *a;
 6093         struct vnode *vp;
 6094 
 6095         a = ap;
 6096         vp = a->a_vp;
 6097         vn_seqc_write_begin(vp);
 6098 }
 6099 
 6100 void
 6101 vop_setextattr_post(void *ap, int rc)
 6102 {
 6103         struct vop_setextattr_args *a;
 6104         struct vnode *vp;
 6105 
 6106         a = ap;
 6107         vp = a->a_vp;
 6108         vn_seqc_write_end(vp);
 6109         if (!rc)
 6110                 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
 6111 }
 6112 
 6113 void
 6114 vop_symlink_pre(void *ap)
 6115 {
 6116         struct vop_symlink_args *a;
 6117         struct vnode *dvp;
 6118 
 6119         a = ap;
 6120         dvp = a->a_dvp;
 6121         vn_seqc_write_begin(dvp);
 6122 }
 6123 
 6124 void
 6125 vop_symlink_post(void *ap, int rc)
 6126 {
 6127         struct vop_symlink_args *a;
 6128         struct vnode *dvp;
 6129 
 6130         a = ap;
 6131         dvp = a->a_dvp;
 6132         vn_seqc_write_end(dvp);
 6133         if (!rc)
 6134                 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
 6135 }
 6136 
 6137 void
 6138 vop_open_post(void *ap, int rc)
 6139 {
 6140         struct vop_open_args *a = ap;
 6141 
 6142         if (!rc)
 6143                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
 6144 }
 6145 
 6146 void
 6147 vop_close_post(void *ap, int rc)
 6148 {
 6149         struct vop_close_args *a = ap;
 6150 
 6151         if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
 6152             !VN_IS_DOOMED(a->a_vp))) {
 6153                 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
 6154                     NOTE_CLOSE_WRITE : NOTE_CLOSE);
 6155         }
 6156 }
 6157 
 6158 void
 6159 vop_read_post(void *ap, int rc)
 6160 {
 6161         struct vop_read_args *a = ap;
 6162 
 6163         if (!rc)
 6164                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
 6165 }
 6166 
 6167 void
 6168 vop_read_pgcache_post(void *ap, int rc)
 6169 {
 6170         struct vop_read_pgcache_args *a = ap;
 6171 
 6172         if (!rc)
 6173                 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ);
 6174 }
 6175 
 6176 void
 6177 vop_readdir_post(void *ap, int rc)
 6178 {
 6179         struct vop_readdir_args *a = ap;
 6180 
 6181         if (!rc)
 6182                 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
 6183 }
 6184 
 6185 static struct knlist fs_knlist;
 6186 
 6187 static void
 6188 vfs_event_init(void *arg)
 6189 {
 6190         knlist_init_mtx(&fs_knlist, NULL);
 6191 }
 6192 /* XXX - correct order? */
 6193 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
 6194 
 6195 void
 6196 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
 6197 {
 6198 
 6199         KNOTE_UNLOCKED(&fs_knlist, event);
 6200 }
 6201 
 6202 static int      filt_fsattach(struct knote *kn);
 6203 static void     filt_fsdetach(struct knote *kn);
 6204 static int      filt_fsevent(struct knote *kn, long hint);
 6205 
 6206 struct filterops fs_filtops = {
 6207         .f_isfd = 0,
 6208         .f_attach = filt_fsattach,
 6209         .f_detach = filt_fsdetach,
 6210         .f_event = filt_fsevent
 6211 };
 6212 
 6213 static int
 6214 filt_fsattach(struct knote *kn)
 6215 {
 6216 
 6217         kn->kn_flags |= EV_CLEAR;
 6218         knlist_add(&fs_knlist, kn, 0);
 6219         return (0);
 6220 }
 6221 
 6222 static void
 6223 filt_fsdetach(struct knote *kn)
 6224 {
 6225 
 6226         knlist_remove(&fs_knlist, kn, 0);
 6227 }
 6228 
 6229 static int
 6230 filt_fsevent(struct knote *kn, long hint)
 6231 {
 6232 
 6233         kn->kn_fflags |= kn->kn_sfflags & hint;
 6234 
 6235         return (kn->kn_fflags != 0);
 6236 }
 6237 
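/*
 * Illustrative sketch (not part of vfs_subr.c): a userland consumer of the
 * EVFILT_FS filter wired up above.  Per filt_fsevent(), the fflags passed at
 * registration select which VQ_* events (sys/mount.h) are reported; the
 * program below is hypothetical and only meant to show the shape of the API.
 */
#if 0   /* userland example, not kernel code */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/mount.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
        struct kevent kev;
        int kq;

        if ((kq = kqueue()) == -1)
                err(1, "kqueue");
        /* fflags chooses the events of interest; VQ_MOUNT/VQ_UNMOUNT assumed here. */
        EV_SET(&kev, 0, EVFILT_FS, EV_ADD | EV_CLEAR,
            VQ_MOUNT | VQ_UNMOUNT, 0, NULL);
        if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
                err(1, "kevent register");
        for (;;) {
                if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
                        err(1, "kevent wait");
                printf("filesystem event, fflags %#x\n", kev.fflags);
        }
}
#endif
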
 6238 static int
 6239 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
 6240 {
 6241         struct vfsidctl vc;
 6242         int error;
 6243         struct mount *mp;
 6244 
 6245         error = SYSCTL_IN(req, &vc, sizeof(vc));
 6246         if (error)
 6247                 return (error);
 6248         if (vc.vc_vers != VFS_CTL_VERS1)
 6249                 return (EINVAL);
 6250         mp = vfs_getvfs(&vc.vc_fsid);
 6251         if (mp == NULL)
 6252                 return (ENOENT);
 6253         /* ensure that a specific sysctl goes to the right filesystem. */
 6254         if (strcmp(vc.vc_fstypename, "*") != 0 &&
 6255             strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
 6256                 vfs_rel(mp);
 6257                 return (EINVAL);
 6258         }
 6259         VCTLTOREQ(&vc, req);
 6260         error = VFS_SYSCTL(mp, vc.vc_op, req);
 6261         vfs_rel(mp);
 6262         return (error);
 6263 }
 6264 
 6265 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR,
 6266     NULL, 0, sysctl_vfs_ctl, "",
 6267     "Sysctl by fsid");
 6268 
 6269 /*
 6270  * Function to initialize a va_filerev field sensibly.
 6271  * XXX: Wouldn't a random number make a lot more sense ??
 6272  */
 6273 u_quad_t
 6274 init_va_filerev(void)
 6275 {
 6276         struct bintime bt;
 6277 
 6278         getbinuptime(&bt);
 6279         return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
 6280 }
 6281 
 6282 static int      filt_vfsread(struct knote *kn, long hint);
 6283 static int      filt_vfswrite(struct knote *kn, long hint);
 6284 static int      filt_vfsvnode(struct knote *kn, long hint);
 6285 static void     filt_vfsdetach(struct knote *kn);
 6286 static struct filterops vfsread_filtops = {
 6287         .f_isfd = 1,
 6288         .f_detach = filt_vfsdetach,
 6289         .f_event = filt_vfsread
 6290 };
 6291 static struct filterops vfswrite_filtops = {
 6292         .f_isfd = 1,
 6293         .f_detach = filt_vfsdetach,
 6294         .f_event = filt_vfswrite
 6295 };
 6296 static struct filterops vfsvnode_filtops = {
 6297         .f_isfd = 1,
 6298         .f_detach = filt_vfsdetach,
 6299         .f_event = filt_vfsvnode
 6300 };
 6301 
 6302 static void
 6303 vfs_knllock(void *arg)
 6304 {
 6305         struct vnode *vp = arg;
 6306 
 6307         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 6308 }
 6309 
 6310 static void
 6311 vfs_knlunlock(void *arg)
 6312 {
 6313         struct vnode *vp = arg;
 6314 
 6315         VOP_UNLOCK(vp);
 6316 }
 6317 
 6318 static void
 6319 vfs_knl_assert_lock(void *arg, int what)
 6320 {
 6321 #ifdef DEBUG_VFS_LOCKS
 6322         struct vnode *vp = arg;
 6323 
 6324         if (what == LA_LOCKED)
 6325                 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
 6326         else
 6327                 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
 6328 #endif
 6329 }
 6330 
 6331 int
 6332 vfs_kqfilter(struct vop_kqfilter_args *ap)
 6333 {
 6334         struct vnode *vp = ap->a_vp;
 6335         struct knote *kn = ap->a_kn;
 6336         struct knlist *knl;
 6337 
 6338         KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ &&
 6339             kn->kn_filter != EVFILT_WRITE),
 6340             ("READ/WRITE filter on a FIFO leaked through"));
 6341         switch (kn->kn_filter) {
 6342         case EVFILT_READ:
 6343                 kn->kn_fop = &vfsread_filtops;
 6344                 break;
 6345         case EVFILT_WRITE:
 6346                 kn->kn_fop = &vfswrite_filtops;
 6347                 break;
 6348         case EVFILT_VNODE:
 6349                 kn->kn_fop = &vfsvnode_filtops;
 6350                 break;
 6351         default:
 6352                 return (EINVAL);
 6353         }
 6354 
 6355         kn->kn_hook = (caddr_t)vp;
 6356 
 6357         v_addpollinfo(vp);
 6358         if (vp->v_pollinfo == NULL)
 6359                 return (ENOMEM);
 6360         knl = &vp->v_pollinfo->vpi_selinfo.si_note;
 6361         vhold(vp);
 6362         knlist_add(knl, kn, 0);
 6363 
 6364         return (0);
 6365 }
 6366 
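/*
 * Illustrative sketch (not part of vfs_subr.c): a filesystem can service
 * VOP_KQFILTER by pointing its vop_vector entry at vfs_kqfilter(), which
 * attaches the knote to the vnode's pollinfo knlist as done above.  The
 * "examplefs" names are hypothetical.
 */
static struct vop_vector examplefs_vnodeops = {
        .vop_default =          &default_vnodeops,
        .vop_kqfilter =         vfs_kqfilter,
        /* ... the remaining examplefs VOPs ... */
};
VFS_VOP_VECTOR_REGISTER(examplefs_vnodeops);
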
 6367 /*
 6368  * Detach knote from vnode
 6369  */
 6370 static void
 6371 filt_vfsdetach(struct knote *kn)
 6372 {
 6373         struct vnode *vp = (struct vnode *)kn->kn_hook;
 6374 
 6375         KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
 6376         knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
 6377         vdrop(vp);
 6378 }
 6379 
 6380 /*ARGSUSED*/
 6381 static int
 6382 filt_vfsread(struct knote *kn, long hint)
 6383 {
 6384         struct vnode *vp = (struct vnode *)kn->kn_hook;
 6385         off_t size;
 6386         int res;
 6387 
 6388         /*
 6389          * filesystem is gone, so set the EOF flag and schedule
 6390          * the knote for deletion.
 6391          */
 6392         if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
 6393                 VI_LOCK(vp);
 6394                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 6395                 VI_UNLOCK(vp);
 6396                 return (1);
 6397         }
 6398 
 6399         if (vn_getsize_locked(vp, &size, curthread->td_ucred) != 0)
 6400                 return (0);
 6401 
 6402         VI_LOCK(vp);
 6403         kn->kn_data = size - kn->kn_fp->f_offset;
 6404         res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
 6405         VI_UNLOCK(vp);
 6406         return (res);
 6407 }
 6408 
 6409 /*ARGSUSED*/
 6410 static int
 6411 filt_vfswrite(struct knote *kn, long hint)
 6412 {
 6413         struct vnode *vp = (struct vnode *)kn->kn_hook;
 6414 
 6415         VI_LOCK(vp);
 6416 
 6417         /*
 6418          * filesystem is gone, so set the EOF flag and schedule
 6419          * the knote for deletion.
 6420          */
 6421         if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
 6422                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 6423 
 6424         kn->kn_data = 0;
 6425         VI_UNLOCK(vp);
 6426         return (1);
 6427 }
 6428 
 6429 static int
 6430 filt_vfsvnode(struct knote *kn, long hint)
 6431 {
 6432         struct vnode *vp = (struct vnode *)kn->kn_hook;
 6433         int res;
 6434 
 6435         VI_LOCK(vp);
 6436         if (kn->kn_sfflags & hint)
 6437                 kn->kn_fflags |= hint;
 6438         if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
 6439                 kn->kn_flags |= EV_EOF;
 6440                 VI_UNLOCK(vp);
 6441                 return (1);
 6442         }
 6443         res = (kn->kn_fflags != 0);
 6444         VI_UNLOCK(vp);
 6445         return (res);
 6446 }
 6447 
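/*
 * Illustrative sketch (not part of vfs_subr.c): the vfsread/vfswrite/vfsvnode
 * filters above are reached from userland by attaching kqueue filters to a
 * file descriptor backed by a vnode.  A hypothetical, minimal watcher for
 * write/attrib/delete events:
 */
#if 0   /* userland example, not kernel code */
#include <sys/types.h>
#include <sys/event.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>

int
main(int argc, char **argv)
{
        struct kevent kev;
        int fd, kq;

        if (argc != 2)
                errx(1, "usage: watch file");
        if ((fd = open(argv[1], O_RDONLY)) == -1)
                err(1, "open");
        if ((kq = kqueue()) == -1)
                err(1, "kqueue");
        EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
            NOTE_WRITE | NOTE_ATTRIB | NOTE_DELETE, 0, NULL);
        if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
                err(1, "kevent register");
        for (;;) {
                if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
                        err(1, "kevent wait");
                printf("vnode event, fflags %#x\n", kev.fflags);
        }
}
#endif
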
 6448 /*
 6449  * Returns whether the directory is empty or not.
 6450  * If it is empty, the return value is 0; otherwise
 6451  * the return value is an error value (which may
 6452  * be ENOTEMPTY).
 6453  */
 6454 int
 6455 vfs_emptydir(struct vnode *vp)
 6456 {
 6457         struct uio uio;
 6458         struct iovec iov;
 6459         struct dirent *dirent, *dp, *endp;
 6460         int error, eof;
 6461 
 6462         error = 0;
 6463         eof = 0;
 6464 
 6465         ASSERT_VOP_LOCKED(vp, "vfs_emptydir");
 6466         VNASSERT(vp->v_type == VDIR, vp, ("vp is not a directory"));
 6467 
 6468         dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK);
 6469         iov.iov_base = dirent;
 6470         iov.iov_len = sizeof(struct dirent);
 6471 
 6472         uio.uio_iov = &iov;
 6473         uio.uio_iovcnt = 1;
 6474         uio.uio_offset = 0;
 6475         uio.uio_resid = sizeof(struct dirent);
 6476         uio.uio_segflg = UIO_SYSSPACE;
 6477         uio.uio_rw = UIO_READ;
 6478         uio.uio_td = curthread;
 6479 
 6480         while (eof == 0 && error == 0) {
 6481                 error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof,
 6482                     NULL, NULL);
 6483                 if (error != 0)
 6484                         break;
 6485                 endp = (void *)((uint8_t *)dirent +
 6486                     sizeof(struct dirent) - uio.uio_resid);
 6487                 for (dp = dirent; dp < endp;
 6488                      dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) {
 6489                         if (dp->d_type == DT_WHT)
 6490                                 continue;
 6491                         if (dp->d_namlen == 0)
 6492                                 continue;
 6493                         if (dp->d_type != DT_DIR &&
 6494                             dp->d_type != DT_UNKNOWN) {
 6495                                 error = ENOTEMPTY;
 6496                                 break;
 6497                         }
 6498                         if (dp->d_namlen > 2) {
 6499                                 error = ENOTEMPTY;
 6500                                 break;
 6501                         }
 6502                         if (dp->d_namlen == 1 &&
 6503                             dp->d_name[0] != '.') {
 6504                                 error = ENOTEMPTY;
 6505                                 break;
 6506                         }
 6507                         if (dp->d_namlen == 2 &&
 6508                             dp->d_name[1] != '.') {
 6509                                 error = ENOTEMPTY;
 6510                                 break;
 6511                         }
 6512                         uio.uio_resid = sizeof(struct dirent);
 6513                 }
 6514         }
 6515         free(dirent, M_TEMP);
 6516         return (error);
 6517 }
 6518 
 6519 int
 6520 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
 6521 {
 6522         int error;
 6523 
 6524         if (dp->d_reclen > ap->a_uio->uio_resid)
 6525                 return (ENAMETOOLONG);
 6526         error = uiomove(dp, dp->d_reclen, ap->a_uio);
 6527         if (error) {
 6528                 if (ap->a_ncookies != NULL) {
 6529                         if (ap->a_cookies != NULL)
 6530                                 free(ap->a_cookies, M_TEMP);
 6531                         ap->a_cookies = NULL;
 6532                         *ap->a_ncookies = 0;
 6533                 }
 6534                 return (error);
 6535         }
 6536         if (ap->a_ncookies == NULL)
 6537                 return (0);
 6538 
 6539         KASSERT(ap->a_cookies,
 6540             ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
 6541 
 6542         *ap->a_cookies = realloc(*ap->a_cookies,
 6543             (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO);
 6544         (*ap->a_cookies)[*ap->a_ncookies] = off;
 6545         *ap->a_ncookies += 1;
 6546         return (0);
 6547 }
 6548 
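/*
 * Illustrative sketch (not part of vfs_subr.c): how a filesystem's
 * VOP_READDIR implementation might emit one entry through
 * vfs_read_dirent().  example_emit_entry() and its arguments are
 * hypothetical; a non-zero return (ENAMETOOLONG) means the caller's
 * buffer is full and the directory scan should stop at this offset.
 */
static int
example_emit_entry(struct vop_readdir_args *ap, ino_t fileno, int type,
    const char *name, off_t off)
{
        struct dirent de;

        memset(&de, 0, sizeof(de));
        de.d_fileno = fileno;
        de.d_type = type;
        de.d_namlen = strlen(name);
        MPASS(de.d_namlen < sizeof(de.d_name));
        memcpy(de.d_name, name, de.d_namlen);
        de.d_reclen = GENERIC_DIRSIZ(&de);
        dirent_terminate(&de);
        return (vfs_read_dirent(ap, &de, off));
}
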
 6549 /*
 6550  * The purpose of this routine is to remove granularity from accmode_t,
 6551  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
 6552  * VADMIN and VAPPEND.
 6553  *
 6554  * If it returns 0, the caller is supposed to continue with the usual
 6555  * access checks using 'accmode' as modified by this routine.  If it
 6556  * returns nonzero value, the caller is supposed to return that value
 6557  * as errno.
 6558  *
 6559  * Note that after this routine runs, accmode may be zero.
 6560  */
 6561 int
 6562 vfs_unixify_accmode(accmode_t *accmode)
 6563 {
 6564         /*
 6565          * There is no way to specify explicit "deny" rule using
 6566          * file mode or POSIX.1e ACLs.
 6567          */
 6568         if (*accmode & VEXPLICIT_DENY) {
 6569                 *accmode = 0;
 6570                 return (0);
 6571         }
 6572 
 6573         /*
 6574          * None of these can be translated into usual access bits.
 6575          * Also, the common case for NFSv4 ACLs is to not contain
 6576          * either of these bits. Caller should check for VWRITE
 6577          * on the containing directory instead.
 6578          */
 6579         if (*accmode & (VDELETE_CHILD | VDELETE))
 6580                 return (EPERM);
 6581 
 6582         if (*accmode & VADMIN_PERMS) {
 6583                 *accmode &= ~VADMIN_PERMS;
 6584                 *accmode |= VADMIN;
 6585         }
 6586 
 6587         /*
 6588          * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
 6589          * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
 6590          */
 6591         *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
 6592 
 6593         return (0);
 6594 }
 6595 
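/*
 * Illustrative sketch (not part of vfs_subr.c): a typical caller of
 * vfs_unixify_accmode() in an ACL-aware permission check.  The
 * example_accessx() function is hypothetical; the control flow follows
 * the contract above: a non-zero return becomes the caller's errno and
 * a zero return continues with the (possibly now empty) accmode.
 */
static int
example_accessx(struct vnode *vp, accmode_t accmode, struct ucred *cred,
    struct thread *td)
{
        int error;

        error = vfs_unixify_accmode(&accmode);
        if (error != 0)
                return (error);
        if (accmode == 0)
                return (0);
        return (VOP_ACCESS(vp, accmode, cred, td));
}
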
 6596 /*
 6597  * Clear out a doomed vnode (if any) and replace it with a new one as long
 6598  * as the fs is not being unmounted. Return the root vnode to the caller.
 6599  */
 6600 static int __noinline
 6601 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp)
 6602 {
 6603         struct vnode *vp;
 6604         int error;
 6605 
 6606 restart:
 6607         if (mp->mnt_rootvnode != NULL) {
 6608                 MNT_ILOCK(mp);
 6609                 vp = mp->mnt_rootvnode;
 6610                 if (vp != NULL) {
 6611                         if (!VN_IS_DOOMED(vp)) {
 6612                                 vrefact(vp);
 6613                                 MNT_IUNLOCK(mp);
 6614                                 error = vn_lock(vp, flags);
 6615                                 if (error == 0) {
 6616                                         *vpp = vp;
 6617                                         return (0);
 6618                                 }
 6619                                 vrele(vp);
 6620                                 goto restart;
 6621                         }
 6622                         /*
 6623                          * Clear the old one.
 6624                          */
 6625                         mp->mnt_rootvnode = NULL;
 6626                 }
 6627                 MNT_IUNLOCK(mp);
 6628                 if (vp != NULL) {
 6629                         vfs_op_barrier_wait(mp);
 6630                         vrele(vp);
 6631                 }
 6632         }
 6633         error = VFS_CACHEDROOT(mp, flags, vpp);
 6634         if (error != 0)
 6635                 return (error);
 6636         if (mp->mnt_vfs_ops == 0) {
 6637                 MNT_ILOCK(mp);
 6638                 if (mp->mnt_vfs_ops != 0) {
 6639                         MNT_IUNLOCK(mp);
 6640                         return (0);
 6641                 }
 6642                 if (mp->mnt_rootvnode == NULL) {
 6643                         vrefact(*vpp);
 6644                         mp->mnt_rootvnode = *vpp;
 6645                 } else {
 6646                         if (mp->mnt_rootvnode != *vpp) {
 6647                                 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) {
 6648                                         panic("%s: mismatch between vnode returned "
  6649                                             "by VFS_CACHEDROOT and the one cached "
  6650                                             "(%p != %p)",
 6651                                             __func__, *vpp, mp->mnt_rootvnode);
 6652                                 }
 6653                         }
 6654                 }
 6655                 MNT_IUNLOCK(mp);
 6656         }
 6657         return (0);
 6658 }
 6659 
 6660 int
 6661 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp)
 6662 {
 6663         struct mount_pcpu *mpcpu;
 6664         struct vnode *vp;
 6665         int error;
 6666 
 6667         if (!vfs_op_thread_enter(mp, mpcpu))
 6668                 return (vfs_cache_root_fallback(mp, flags, vpp));
 6669         vp = atomic_load_ptr(&mp->mnt_rootvnode);
 6670         if (vp == NULL || VN_IS_DOOMED(vp)) {
 6671                 vfs_op_thread_exit(mp, mpcpu);
 6672                 return (vfs_cache_root_fallback(mp, flags, vpp));
 6673         }
 6674         vrefact(vp);
 6675         vfs_op_thread_exit(mp, mpcpu);
 6676         error = vn_lock(vp, flags);
 6677         if (error != 0) {
 6678                 vrele(vp);
 6679                 return (vfs_cache_root_fallback(mp, flags, vpp));
 6680         }
 6681         *vpp = vp;
 6682         return (0);
 6683 }
 6684 
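/*
 * Illustrative sketch (not part of vfs_subr.c): fetching the root vnode of
 * a mounted filesystem through the cache maintained above.  On success the
 * vnode is returned referenced and locked with the requested flags; the
 * hypothetical caller releases both with vput().
 */
static int
example_visit_root(struct mount *mp)
{
        struct vnode *vp;
        int error;

        error = vfs_cache_root(mp, LK_SHARED, &vp);
        if (error != 0)
                return (error);
        /* ... inspect the root vnode ... */
        vput(vp);
        return (0);
}
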
 6685 struct vnode *
 6686 vfs_cache_root_clear(struct mount *mp)
 6687 {
 6688         struct vnode *vp;
 6689 
 6690         /*
 6691          * ops > 0 guarantees there is nobody who can see this vnode
 6692          */
 6693         MPASS(mp->mnt_vfs_ops > 0);
 6694         vp = mp->mnt_rootvnode;
 6695         if (vp != NULL)
 6696                 vn_seqc_write_begin(vp);
 6697         mp->mnt_rootvnode = NULL;
 6698         return (vp);
 6699 }
 6700 
 6701 void
 6702 vfs_cache_root_set(struct mount *mp, struct vnode *vp)
 6703 {
 6704 
 6705         MPASS(mp->mnt_vfs_ops > 0);
 6706         vrefact(vp);
 6707         mp->mnt_rootvnode = vp;
 6708 }
 6709 
 6710 /*
 6711  * These are helper functions for filesystems to traverse all
 6712  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
 6713  *
 6714  * This interface replaces MNT_VNODE_FOREACH.
 6715  */
 6716 
 6717 struct vnode *
 6718 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
 6719 {
 6720         struct vnode *vp;
 6721 
 6722         if (should_yield())
 6723                 kern_yield(PRI_USER);
 6724         MNT_ILOCK(mp);
 6725         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 6726         for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
 6727             vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
 6728                 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
 6729                 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
 6730                         continue;
 6731                 VI_LOCK(vp);
 6732                 if (VN_IS_DOOMED(vp)) {
 6733                         VI_UNLOCK(vp);
 6734                         continue;
 6735                 }
 6736                 break;
 6737         }
 6738         if (vp == NULL) {
 6739                 __mnt_vnode_markerfree_all(mvp, mp);
 6740                 /* MNT_IUNLOCK(mp); -- done in above function */
 6741                 mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
 6742                 return (NULL);
 6743         }
 6744         TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 6745         TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 6746         MNT_IUNLOCK(mp);
 6747         return (vp);
 6748 }
 6749 
 6750 struct vnode *
 6751 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
 6752 {
 6753         struct vnode *vp;
 6754 
 6755         *mvp = vn_alloc_marker(mp);
 6756         MNT_ILOCK(mp);
 6757         MNT_REF(mp);
 6758 
 6759         TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 6760                 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
 6761                 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
 6762                         continue;
 6763                 VI_LOCK(vp);
 6764                 if (VN_IS_DOOMED(vp)) {
 6765                         VI_UNLOCK(vp);
 6766                         continue;
 6767                 }
 6768                 break;
 6769         }
 6770         if (vp == NULL) {
 6771                 MNT_REL(mp);
 6772                 MNT_IUNLOCK(mp);
 6773                 vn_free_marker(*mvp);
 6774                 *mvp = NULL;
 6775                 return (NULL);
 6776         }
 6777         TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 6778         MNT_IUNLOCK(mp);
 6779         return (vp);
 6780 }
 6781 
 6782 void
 6783 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
 6784 {
 6785 
 6786         if (*mvp == NULL) {
 6787                 MNT_IUNLOCK(mp);
 6788                 return;
 6789         }
 6790 
 6791         mtx_assert(MNT_MTX(mp), MA_OWNED);
 6792 
 6793         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 6794         TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 6795         MNT_REL(mp);
 6796         MNT_IUNLOCK(mp);
 6797         vn_free_marker(*mvp);
 6798         *mvp = NULL;
 6799 }
 6800 
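/*
 * Illustrative sketch (not part of vfs_subr.c): typical use of the iterator
 * above via the MNT_VNODE_FOREACH_ALL() macro from sys/mount.h.  Each vnode
 * is handed to the loop body with its interlock held; a consumer that bails
 * out of the loop early must call MNT_VNODE_FOREACH_ALL_ABORT() to free the
 * marker.  example_scan_mount() and its filter are hypothetical.
 */
static void
example_scan_mount(struct mount *mp)
{
        struct vnode *vp, *mvp;

        MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
                if (vp->v_type != VREG) {
                        VI_UNLOCK(vp);
                        continue;
                }
                /* ... examine the vnode under the interlock ... */
                VI_UNLOCK(vp);
        }
}
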
 6801 /*
 6802  * These are helper functions for filesystems to traverse their
 6803  * lazy vnodes.  See MNT_VNODE_FOREACH_LAZY() in sys/mount.h
 6804  */
 6805 static void
 6806 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
 6807 {
 6808 
 6809         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 6810 
 6811         MNT_ILOCK(mp);
 6812         MNT_REL(mp);
 6813         MNT_IUNLOCK(mp);
 6814         vn_free_marker(*mvp);
 6815         *mvp = NULL;
 6816 }
 6817 
 6818 /*
 6819  * Relock the mp mount vnode list lock with the vp vnode interlock in the
 6820  * conventional lock order during mnt_vnode_next_lazy iteration.
 6821  *
 6822  * On entry, the mount vnode list lock is held and the vnode interlock is not.
 6823  * The list lock is dropped and reacquired.  On success, both locks are held.
 6824  * On failure, the mount vnode list lock is held but the vnode interlock is
 6825  * not, and the procedure may have yielded.
 6826  */
 6827 static bool
 6828 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp,
 6829     struct vnode *vp)
 6830 {
 6831 
 6832         VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
 6833             TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp,
 6834             ("%s: bad marker", __func__));
 6835         VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
 6836             ("%s: inappropriate vnode", __func__));
 6837         ASSERT_VI_UNLOCKED(vp, __func__);
 6838         mtx_assert(&mp->mnt_listmtx, MA_OWNED);
 6839 
 6840         TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist);
 6841         TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist);
 6842 
 6843         /*
 6844          * Note we may be racing against vdrop which transitioned the hold
 6845          * count to 0 and now waits for the ->mnt_listmtx lock. This is fine,
 6846          * if we are the only user after we get the interlock we will just
 6847          * vdrop.
 6848          */
 6849         vhold(vp);
 6850         mtx_unlock(&mp->mnt_listmtx);
 6851         VI_LOCK(vp);
 6852         if (VN_IS_DOOMED(vp)) {
 6853                 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
 6854                 goto out_lost;
 6855         }
 6856         VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
 6857         /*
 6858          * There is nothing to do if we are the last user.
 6859          */
 6860         if (!refcount_release_if_not_last(&vp->v_holdcnt))
 6861                 goto out_lost;
 6862         mtx_lock(&mp->mnt_listmtx);
 6863         return (true);
 6864 out_lost:
 6865         vdropl(vp);
 6866         maybe_yield();
 6867         mtx_lock(&mp->mnt_listmtx);
 6868         return (false);
 6869 }
 6870 
 6871 static struct vnode *
 6872 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
 6873     void *cbarg)
 6874 {
 6875         struct vnode *vp;
 6876 
 6877         mtx_assert(&mp->mnt_listmtx, MA_OWNED);
 6878         KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 6879 restart:
 6880         vp = TAILQ_NEXT(*mvp, v_lazylist);
 6881         while (vp != NULL) {
 6882                 if (vp->v_type == VMARKER) {
 6883                         vp = TAILQ_NEXT(vp, v_lazylist);
 6884                         continue;
 6885                 }
 6886                 /*
 6887                  * See if we want to process the vnode. Note we may encounter a
 6888                  * long string of vnodes we don't care about and hog the list
 6889                  * as a result. Check for it and requeue the marker.
 6890                  */
 6891                 VNPASS(!VN_IS_DOOMED(vp), vp);
 6892                 if (!cb(vp, cbarg)) {
 6893                         if (!should_yield()) {
 6894                                 vp = TAILQ_NEXT(vp, v_lazylist);
 6895                                 continue;
 6896                         }
 6897                         TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp,
 6898                             v_lazylist);
 6899                         TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp,
 6900                             v_lazylist);
 6901                         mtx_unlock(&mp->mnt_listmtx);
 6902                         kern_yield(PRI_USER);
 6903                         mtx_lock(&mp->mnt_listmtx);
 6904                         goto restart;
 6905                 }
 6906                 /*
 6907                  * Try-lock because this is the wrong lock order.
 6908                  */
 6909                 if (!VI_TRYLOCK(vp) &&
 6910                     !mnt_vnode_next_lazy_relock(*mvp, mp, vp))
 6911                         goto restart;
 6912                 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
 6913                 KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
 6914                     ("alien vnode on the lazy list %p %p", vp, mp));
 6915                 VNPASS(vp->v_mount == mp, vp);
 6916                 VNPASS(!VN_IS_DOOMED(vp), vp);
 6917                 break;
 6918         }
 6919         TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
 6920 
 6921         /* Check if we are done */
 6922         if (vp == NULL) {
 6923                 mtx_unlock(&mp->mnt_listmtx);
 6924                 mnt_vnode_markerfree_lazy(mvp, mp);
 6925                 return (NULL);
 6926         }
 6927         TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist);
 6928         mtx_unlock(&mp->mnt_listmtx);
 6929         ASSERT_VI_LOCKED(vp, "lazy iter");
 6930         return (vp);
 6931 }
 6932 
 6933 struct vnode *
 6934 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
 6935     void *cbarg)
 6936 {
 6937 
 6938         if (should_yield())
 6939                 kern_yield(PRI_USER);
 6940         mtx_lock(&mp->mnt_listmtx);
 6941         return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
 6942 }
 6943 
 6944 struct vnode *
 6945 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
 6946     void *cbarg)
 6947 {
 6948         struct vnode *vp;
 6949 
 6950         if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist))
 6951                 return (NULL);
 6952 
 6953         *mvp = vn_alloc_marker(mp);
 6954         MNT_ILOCK(mp);
 6955         MNT_REF(mp);
 6956         MNT_IUNLOCK(mp);
 6957 
 6958         mtx_lock(&mp->mnt_listmtx);
 6959         vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist);
 6960         if (vp == NULL) {
 6961                 mtx_unlock(&mp->mnt_listmtx);
 6962                 mnt_vnode_markerfree_lazy(mvp, mp);
 6963                 return (NULL);
 6964         }
 6965         TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist);
 6966         return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
 6967 }
 6968 
 6969 void
 6970 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
 6971 {
 6972 
 6973         if (*mvp == NULL)
 6974                 return;
 6975 
 6976         mtx_lock(&mp->mnt_listmtx);
 6977         TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
 6978         mtx_unlock(&mp->mnt_listmtx);
 6979         mnt_vnode_markerfree_lazy(mvp, mp);
 6980 }
 6981 
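/*
 * Illustrative sketch (not part of vfs_subr.c): use of the lazy iterator via
 * MNT_VNODE_FOREACH_LAZY() from sys/mount.h.  The callback runs without the
 * vnode interlock and merely pre-filters which vnodes are worth visiting;
 * vnodes that pass are handed to the loop body with the interlock held.  All
 * "example_" names are hypothetical and the callback is assumed to match the
 * mnt_lazy_cb_t signature (returning non-zero to visit the vnode).
 */
static int
example_lazy_cb(struct vnode *vp, void *arg __unused)
{

        /* Racy, unlocked pre-filter: only visit vnodes with a VM object. */
        return (vp->v_object != NULL);
}

static void
example_scan_lazy(struct mount *mp)
{
        struct vnode *vp, *mvp;

        MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, example_lazy_cb, NULL) {
                /* ... work on vp ... */
                VI_UNLOCK(vp);
        }
}
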
 6982 int
 6983 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp)
 6984 {
 6985 
 6986         if ((cnp->cn_flags & NOEXECCHECK) != 0) {
 6987                 cnp->cn_flags &= ~NOEXECCHECK;
 6988                 return (0);
 6989         }
 6990 
 6991         return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread));
 6992 }
 6993 
 6994 /*
 6995  * Do not use this variant unless you have means other than the hold count
 6996  * to prevent the vnode from getting freed.
 6997  */
 6998 void
 6999 vn_seqc_write_begin_locked(struct vnode *vp)
 7000 {
 7001 
 7002         ASSERT_VI_LOCKED(vp, __func__);
 7003         VNPASS(vp->v_holdcnt > 0, vp);
 7004         VNPASS(vp->v_seqc_users >= 0, vp);
 7005         vp->v_seqc_users++;
 7006         if (vp->v_seqc_users == 1)
 7007                 seqc_sleepable_write_begin(&vp->v_seqc);
 7008 }
 7009 
 7010 void
 7011 vn_seqc_write_begin(struct vnode *vp)
 7012 {
 7013 
 7014         VI_LOCK(vp);
 7015         vn_seqc_write_begin_locked(vp);
 7016         VI_UNLOCK(vp);
 7017 }
 7018 
 7019 void
 7020 vn_seqc_write_end_locked(struct vnode *vp)
 7021 {
 7022 
 7023         ASSERT_VI_LOCKED(vp, __func__);
 7024         VNPASS(vp->v_seqc_users > 0, vp);
 7025         vp->v_seqc_users--;
 7026         if (vp->v_seqc_users == 0)
 7027                 seqc_sleepable_write_end(&vp->v_seqc);
 7028 }
 7029 
 7030 void
 7031 vn_seqc_write_end(struct vnode *vp)
 7032 {
 7033 
 7034         VI_LOCK(vp);
 7035         vn_seqc_write_end_locked(vp);
 7036         VI_UNLOCK(vp);
 7037 }
 7038 
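/*
 * Illustrative sketch (not part of vfs_subr.c): bracketing an out-of-band
 * vnode metadata update with a seqc write section, mirroring what the
 * vop_*_pre()/vop_*_post() hooks earlier in this file do.  While the section
 * is open, lockless (SMR) lookup refuses to trust the vnode and falls back
 * to the locked path.  The update itself is left hypothetical.
 */
static void
example_update_metadata(struct vnode *vp)
{

        vn_seqc_write_begin(vp);
        /* ... modify fields that lockless lookup may inspect ... */
        vn_seqc_write_end(vp);
}
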
 7039 /*
 7040  * Special case handling for allocating and freeing vnodes.
 7041  *
 7042  * The counter is deliberately left in the modify state on free, so that
 7043  * a doomed vnode keeps failing seqc validation while still visible via SMR.
 7044  */
 7045 static void
 7046 vn_seqc_init(struct vnode *vp)
 7047 {
 7048 
 7049         vp->v_seqc = 0;
 7050         vp->v_seqc_users = 0;
 7051 }
 7052 
 7053 static void
 7054 vn_seqc_write_end_free(struct vnode *vp)
 7055 {
 7056 
 7057         VNPASS(seqc_in_modify(vp->v_seqc), vp);
 7058         VNPASS(vp->v_seqc_users == 1, vp);
 7059 }
 7060 
 7061 void
 7062 vn_irflag_set_locked(struct vnode *vp, short toset)
 7063 {
 7064         short flags;
 7065 
 7066         ASSERT_VI_LOCKED(vp, __func__);
 7067         flags = vn_irflag_read(vp);
 7068         VNASSERT((flags & toset) == 0, vp,
 7069             ("%s: some of the passed flags already set (have %d, passed %d)\n",
 7070             __func__, flags, toset));
 7071         atomic_store_short(&vp->v_irflag, flags | toset);
 7072 }
 7073 
 7074 void
 7075 vn_irflag_set(struct vnode *vp, short toset)
 7076 {
 7077 
 7078         VI_LOCK(vp);
 7079         vn_irflag_set_locked(vp, toset);
 7080         VI_UNLOCK(vp);
 7081 }
 7082 
 7083 void
 7084 vn_irflag_set_cond_locked(struct vnode *vp, short toset)
 7085 {
 7086         short flags;
 7087 
 7088         ASSERT_VI_LOCKED(vp, __func__);
 7089         flags = vn_irflag_read(vp);
 7090         atomic_store_short(&vp->v_irflag, flags | toset);
 7091 }
 7092 
 7093 void
 7094 vn_irflag_set_cond(struct vnode *vp, short toset)
 7095 {
 7096 
 7097         VI_LOCK(vp);
 7098         vn_irflag_set_cond_locked(vp, toset);
 7099         VI_UNLOCK(vp);
 7100 }
 7101 
 7102 void
 7103 vn_irflag_unset_locked(struct vnode *vp, short tounset)
 7104 {
 7105         short flags;
 7106 
 7107         ASSERT_VI_LOCKED(vp, __func__);
 7108         flags = vn_irflag_read(vp);
 7109         VNASSERT((flags & tounset) == tounset, vp,
 7110             ("%s: some of the passed flags not set (have %d, passed %d)\n",
 7111             __func__, flags, tounset));
 7112         atomic_store_short(&vp->v_irflag, flags & ~tounset);
 7113 }
 7114 
 7115 void
 7116 vn_irflag_unset(struct vnode *vp, short tounset)
 7117 {
 7118 
 7119         VI_LOCK(vp);
 7120         vn_irflag_unset_locked(vp, tounset);
 7121         VI_UNLOCK(vp);
 7122 }
 7123 
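/*
 * Illustrative sketch (not part of vfs_subr.c): the inline-flag helpers above
 * are how filesystems publish per-vnode capabilities.  As an assumed example,
 * a filesystem that can service VOP_READ_PGCACHE advertises it by setting
 * VIRF_PGREAD once the vnode's pager state is ready.
 */
static void
example_enable_pgcache_read(struct vnode *vp)
{

        /* The _cond variant tolerates the flag already being set. */
        vn_irflag_set_cond(vp, VIRF_PGREAD);
}
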
 7124 int
 7125 vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *cred)
 7126 {
 7127         struct vattr vattr;
 7128         int error;
 7129 
 7130         ASSERT_VOP_LOCKED(vp, __func__);
 7131         error = VOP_GETATTR(vp, &vattr, cred);
 7132         if (__predict_true(error == 0)) {
 7133                 if (vattr.va_size <= OFF_MAX)
 7134                         *size = vattr.va_size;
 7135                 else
 7136                         error = EFBIG;
 7137         }
 7138         return (error);
 7139 }
 7140 
 7141 int
 7142 vn_getsize(struct vnode *vp, off_t *size, struct ucred *cred)
 7143 {
 7144         int error;
 7145 
 7146         VOP_LOCK(vp, LK_SHARED);
 7147         error = vn_getsize_locked(vp, size, cred);
 7148         VOP_UNLOCK(vp);
 7149         return (error);
 7150 }
 7151 
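/*
 * Illustrative sketch (not part of vfs_subr.c): vn_getsize() is the
 * convenience wrapper for callers that do not already hold the vnode lock;
 * example_report_size() is hypothetical.
 */
static int
example_report_size(struct vnode *vp, struct ucred *cred)
{
        off_t size;
        int error;

        error = vn_getsize(vp, &size, cred);
        if (error == 0)
                printf("vnode %p: %jd bytes\n", vp, (intmax_t)size);
        return (error);
}
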
 7152 #ifdef INVARIANTS
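/*
 * Validate a vnode state transition.  The transitions accepted below are:
 *
 *      UNINITIALIZED -> CONSTRUCTED or DESTROYING
 *      CONSTRUCTED   -> DESTROYING   (vnode lock held exclusively)
 *      DESTROYING    -> DEAD         (vnode lock held exclusively)
 *      DEAD          -> UNINITIALIZED
 *
 * Anything else panics.
 */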
 7153 void
 7154 vn_set_state_validate(struct vnode *vp, enum vstate state)
 7155 {
 7156 
 7157         switch (vp->v_state) {
 7158         case VSTATE_UNINITIALIZED:
 7159                 switch (state) {
 7160                 case VSTATE_CONSTRUCTED:
 7161                 case VSTATE_DESTROYING:
 7162                         return;
 7163                 default:
 7164                         break;
 7165                 }
 7166                 break;
 7167         case VSTATE_CONSTRUCTED:
 7168                 ASSERT_VOP_ELOCKED(vp, __func__);
 7169                 switch (state) {
 7170                 case VSTATE_DESTROYING:
 7171                         return;
 7172                 default:
 7173                         break;
 7174                 }
 7175                 break;
 7176         case VSTATE_DESTROYING:
 7177                 ASSERT_VOP_ELOCKED(vp, __func__);
 7178                 switch (state) {
 7179                 case VSTATE_DEAD:
 7180                         return;
 7181                 default:
 7182                         break;
 7183                 }
 7184                 break;
 7185         case VSTATE_DEAD:
 7186                 switch (state) {
 7187                 case VSTATE_UNINITIALIZED:
 7188                         return;
 7189                 default:
 7190                         break;
 7191                 }
 7192                 break;
 7193         }
 7194 
 7195         vn_printf(vp, "invalid state transition %d -> %d\n", vp->v_state, state);
 7196         panic("invalid state transition %d -> %d\n", vp->v_state, state);
 7197 }
 7198 #endif
