FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_vnode.c

    1 /*      $NetBSD: vfs_vnode.c,v 1.147 2022/10/26 23:40:08 riastradh Exp $        */
    2 
    3 /*-
    4  * Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to The NetBSD Foundation
    8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
    9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
   10  *
   11  * Redistribution and use in source and binary forms, with or without
   12  * modification, are permitted provided that the following conditions
   13  * are met:
   14  * 1. Redistributions of source code must retain the above copyright
   15  *    notice, this list of conditions and the following disclaimer.
   16  * 2. Redistributions in binary form must reproduce the above copyright
   17  *    notice, this list of conditions and the following disclaimer in the
   18  *    documentation and/or other materials provided with the distribution.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   30  * POSSIBILITY OF SUCH DAMAGE.
   31  */
   32 
   33 /*
   34  * Copyright (c) 1989, 1993
   35  *      The Regents of the University of California.  All rights reserved.
   36  * (c) UNIX System Laboratories, Inc.
   37  * All or some portions of this file are derived from material licensed
   38  * to the University of California by American Telephone and Telegraph
   39  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   40  * the permission of UNIX System Laboratories, Inc.
   41  *
   42  * Redistribution and use in source and binary forms, with or without
   43  * modification, are permitted provided that the following conditions
   44  * are met:
   45  * 1. Redistributions of source code must retain the above copyright
   46  *    notice, this list of conditions and the following disclaimer.
   47  * 2. Redistributions in binary form must reproduce the above copyright
   48  *    notice, this list of conditions and the following disclaimer in the
   49  *    documentation and/or other materials provided with the distribution.
   50  * 3. Neither the name of the University nor the names of its contributors
   51  *    may be used to endorse or promote products derived from this software
   52  *    without specific prior written permission.
   53  *
   54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   64  * SUCH DAMAGE.
   65  *
   66  *      @(#)vfs_subr.c  8.13 (Berkeley) 4/18/94
   67  */
   68 
   69 /*
   70  * The vnode cache subsystem.
   71  *
   72  * Life-cycle
   73  *
   74  *      Normally, there are two points where new vnodes are created:
   75  *      VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
   76  *      starts in one of the following ways:
   77  *
   78  *      - Allocation, via vcache_get(9) or vcache_new(9).
   79  *      - Reclamation of inactive vnode, via vcache_vget(9).
   80  *
    81  *      Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
    82  *      was another, traditional way.  Currently, only the draining thread
    83  *      recycles vnodes.  This behaviour might be revisited.
   84  *
    85  *      The life-cycle ends when the last reference is dropped, usually in
    86  *      VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform the
    87  *      file system that the vnode is inactive.  Via this call, the file system
    88  *      indicates whether the vnode can be recycled (usually, it checks its own
    89  *      references, e.g. the link count, or whether the file was removed).
   90  *
    91  *      Depending on that indication, the vnode can be put onto a free list
    92  *      (cache), or cleaned via vcache_reclaim(), which calls VOP_RECLAIM(9)
    93  *      to disassociate the underlying file system from the vnode, and then
    94  *      finally destroyed.
   95  *
   96  * Vnode state
   97  *
   98  *      Vnode is always in one of six states:
   99  *      - MARKER        This is a marker vnode to help list traversal.  It
  100  *                      will never change its state.
   101  *      - LOADING       Vnode is being associated with the underlying
   102  *                      file system and is not yet ready to use.
   103  *      - LOADED        Vnode is associated with the underlying file
   104  *                      system and is ready to use.
  105  *      - BLOCKED       Vnode is active but cannot get new references.
  106  *      - RECLAIMING    Vnode is disassociating from the underlying file
  107  *                      system.
  108  *      - RECLAIMED     Vnode has disassociated from underlying file system
  109  *                      and is dead.
  110  *
  111  *      Valid state changes are:
  112  *      LOADING -> LOADED
  113  *                      Vnode has been initialised in vcache_get() or
  114  *                      vcache_new() and is ready to use.
  115  *      BLOCKED -> RECLAIMING
  116  *                      Vnode starts disassociation from underlying file
  117  *                      system in vcache_reclaim().
  118  *      RECLAIMING -> RECLAIMED
  119  *                      Vnode finished disassociation from underlying file
  120  *                      system in vcache_reclaim().
  121  *      LOADED -> BLOCKED
  122  *                      Either vcache_rekey*() is changing the vnode key or
  123  *                      vrelel() is about to call VOP_INACTIVE().
  124  *      BLOCKED -> LOADED
  125  *                      The block condition is over.
  126  *      LOADING -> RECLAIMED
  127  *                      Either vcache_get() or vcache_new() failed to
  128  *                      associate the underlying file system or vcache_rekey*()
  129  *                      drops a vnode used as placeholder.
  130  *
   131  *      Of these states, LOADING, BLOCKED and RECLAIMING are intermediate,
   132  *      and it is possible to wait for a state change.
  133  *
  134  *      State is protected with v_interlock with one exception:
  135  *      to change from LOADING both v_interlock and vcache_lock must be held
  136  *      so it is possible to check "state == LOADING" without holding
  137  *      v_interlock.  See vcache_get() for details.
  138  *
  139  * Reference counting
  140  *
   141  *      A vnode is considered active if its reference count
   142  *      (vnode_t::v_usecount) is non-zero.  It is maintained using the vref(9),
   143  *      vrele(9) and vput(9) routines.  Common points holding references are
   144  *      e.g. open files, the current working directory, mount points, etc.
   145  *
   146  *      v_usecount is adjusted with atomic operations; however, to change
   147  *      from a non-zero value to zero the interlock must also be held.
  148  */
  149 
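/*
 * Editorial illustration (not part of the original source): a minimal sketch
 * of how a file system consumer might use the life-cycle and reference
 * counting interfaces described above.  The mount point "mp" and the ino_t
 * key are assumptions for illustration only.
 *
 *      struct vnode *vp;
 *      ino_t ino = 2;                          // hypothetical file key
 *      int error;
 *
 *      error = vcache_get(mp, &ino, sizeof(ino), &vp);
 *      if (error != 0)
 *              return error;                   // no reference was taken
 *      vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   // lock for VOP_*() use
 *      // ... operate on the loaded, referenced vnode ...
 *      vput(vp);                               // unlock and drop the reference
 */
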
  150 #include <sys/cdefs.h>
  151 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.147 2022/10/26 23:40:08 riastradh Exp $");
  152 
  153 #ifdef _KERNEL_OPT
  154 #include "opt_pax.h"
  155 #endif
  156 
  157 #include <sys/param.h>
  158 #include <sys/kernel.h>
  159 
  160 #include <sys/atomic.h>
  161 #include <sys/buf.h>
  162 #include <sys/conf.h>
  163 #include <sys/device.h>
  164 #include <sys/hash.h>
  165 #include <sys/kauth.h>
  166 #include <sys/kmem.h>
  167 #include <sys/kthread.h>
  168 #include <sys/module.h>
  169 #include <sys/mount.h>
  170 #include <sys/namei.h>
  171 #include <sys/pax.h>
  172 #include <sys/syscallargs.h>
  173 #include <sys/sysctl.h>
  174 #include <sys/systm.h>
  175 #include <sys/vnode_impl.h>
  176 #include <sys/wapbl.h>
  177 #include <sys/fstrans.h>
  178 
  179 #include <miscfs/deadfs/deadfs.h>
  180 #include <miscfs/specfs/specdev.h>
  181 
  182 #include <uvm/uvm.h>
  183 #include <uvm/uvm_readahead.h>
  184 #include <uvm/uvm_stat.h>
  185 
  186 /* Flags to vrelel. */
  187 #define VRELEL_ASYNC    0x0001  /* Always defer to vrele thread. */
  188 
  189 #define LRU_VRELE       0
  190 #define LRU_FREE        1
  191 #define LRU_HOLD        2
  192 #define LRU_COUNT       3
  193 
  194 /*
  195  * There are three lru lists: one holds vnodes waiting for async release,
  196  * one is for vnodes which have no buffer/page references and one for those
  197  * which do (i.e.  v_holdcnt is non-zero).  We put the lists into a single,
  198  * private cache line as vnodes migrate between them while under the same
  199  * lock (vdrain_lock).
  200  */
  201 u_int                   numvnodes               __cacheline_aligned;
  202 static vnodelst_t       lru_list[LRU_COUNT]     __cacheline_aligned;
  203 static kmutex_t         vdrain_lock             __cacheline_aligned;
  204 static kcondvar_t       vdrain_cv;
  205 static int              vdrain_gen;
  206 static kcondvar_t       vdrain_gen_cv;
  207 static bool             vdrain_retry;
  208 static lwp_t *          vdrain_lwp;
  209 SLIST_HEAD(hashhead, vnode_impl);
  210 static kmutex_t         vcache_lock             __cacheline_aligned;
  211 static kcondvar_t       vcache_cv;
  212 static u_int            vcache_hashsize;
  213 static u_long           vcache_hashmask;
  214 static struct hashhead  *vcache_hashtab;
  215 static pool_cache_t     vcache_pool;
  216 static void             lru_requeue(vnode_t *, vnodelst_t *);
  217 static vnodelst_t *     lru_which(vnode_t *);
  218 static vnode_impl_t *   vcache_alloc(void);
  219 static void             vcache_dealloc(vnode_impl_t *);
  220 static void             vcache_free(vnode_impl_t *);
  221 static void             vcache_init(void);
  222 static void             vcache_reinit(void);
  223 static void             vcache_reclaim(vnode_t *);
  224 static void             vrelel(vnode_t *, int, int);
  225 static void             vdrain_thread(void *);
  226 static void             vnpanic(vnode_t *, const char *, ...)
  227     __printflike(2, 3);
  228 
  229 /* Routines having to do with the management of the vnode table. */
  230 
  231 /*
  232  * The high bit of v_usecount is a gate for vcache_tryvget().  It's set
  233  * only when the vnode state is LOADED.
  234  * The next bit of v_usecount is a flag for vrelel().  It's set
  235  * from vcache_vget() and vcache_tryvget() whenever the operation succeeds.
  236  */
  237 #define VUSECOUNT_MASK  0x3fffffff
  238 #define VUSECOUNT_GATE  0x80000000
  239 #define VUSECOUNT_VGET  0x40000000
  240 
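/*
 * Editorial illustration (not part of the original source): how the bits
 * above combine in v_usecount.  A LOADED vnode holding two references, the
 * most recent taken via vcache_tryvget(), would carry
 *
 *      v_usecount == VUSECOUNT_GATE | VUSECOUNT_VGET | 2
 *
 * and vrefcnt() below masks off the flag bits and returns 2.
 */
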
  241 /*
  242  * Return the current usecount of a vnode.
  243  */
  244 inline int
  245 vrefcnt(struct vnode *vp)
  246 {
  247 
  248         return atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_MASK;
  249 }
  250 
  251 /* Vnode state operations and diagnostics. */
  252 
  253 #if defined(DIAGNOSTIC)
  254 
  255 #define VSTATE_VALID(state) \
  256         ((state) != VS_ACTIVE && (state) != VS_MARKER)
  257 #define VSTATE_GET(vp) \
  258         vstate_assert_get((vp), __func__, __LINE__)
  259 #define VSTATE_CHANGE(vp, from, to) \
  260         vstate_assert_change((vp), (from), (to), __func__, __LINE__)
  261 #define VSTATE_WAIT_STABLE(vp) \
  262         vstate_assert_wait_stable((vp), __func__, __LINE__)
  263 
  264 void
  265 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
  266     bool has_lock)
  267 {
  268         vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
  269         int refcnt = vrefcnt(vp);
  270 
  271         if (!has_lock) {
  272                 /*
  273                  * Prevent predictive loads from the CPU, but check the state
   274                  * without locking first.
  275                  *
  276                  * XXX what does this pair with?
  277                  */
  278                 membar_enter();
  279                 if (state == VS_ACTIVE && refcnt > 0 &&
  280                     (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED))
  281                         return;
  282                 if (vip->vi_state == state)
  283                         return;
  284                 mutex_enter((vp)->v_interlock);
  285         }
  286 
  287         KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
  288 
  289         if ((state == VS_ACTIVE && refcnt > 0 &&
  290             (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) ||
  291             vip->vi_state == state) {
  292                 if (!has_lock)
  293                         mutex_exit((vp)->v_interlock);
  294                 return;
  295         }
  296         vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d",
  297             vstate_name(vip->vi_state), refcnt,
  298             vstate_name(state), func, line);
  299 }
  300 
  301 static enum vnode_state
  302 vstate_assert_get(vnode_t *vp, const char *func, int line)
  303 {
  304         vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
  305 
  306         KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
  307         if (! VSTATE_VALID(vip->vi_state))
  308                 vnpanic(vp, "state is %s at %s:%d",
  309                     vstate_name(vip->vi_state), func, line);
  310 
  311         return vip->vi_state;
  312 }
  313 
  314 static void
  315 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
  316 {
  317         vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
  318 
  319         KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
  320         if (! VSTATE_VALID(vip->vi_state))
  321                 vnpanic(vp, "state is %s at %s:%d",
  322                     vstate_name(vip->vi_state), func, line);
  323 
  324         while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
  325                 cv_wait(&vp->v_cv, vp->v_interlock);
  326 
  327         if (! VSTATE_VALID(vip->vi_state))
  328                 vnpanic(vp, "state is %s at %s:%d",
  329                     vstate_name(vip->vi_state), func, line);
  330 }
  331 
  332 static void
  333 vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
  334     const char *func, int line)
  335 {
  336         bool gated = (atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_GATE);
  337         vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
  338 
  339         KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
  340         if (from == VS_LOADING)
  341                 KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);
  342 
  343         if (! VSTATE_VALID(from))
  344                 vnpanic(vp, "from is %s at %s:%d",
  345                     vstate_name(from), func, line);
  346         if (! VSTATE_VALID(to))
  347                 vnpanic(vp, "to is %s at %s:%d",
  348                     vstate_name(to), func, line);
  349         if (vip->vi_state != from)
  350                 vnpanic(vp, "from is %s, expected %s at %s:%d\n",
  351                     vstate_name(vip->vi_state), vstate_name(from), func, line);
  352         if ((from == VS_LOADED) != gated)
  353                 vnpanic(vp, "state is %s, gate %d does not match at %s:%d\n",
  354                     vstate_name(vip->vi_state), gated, func, line);
  355 
  356         /* Open/close the gate for vcache_tryvget(). */
  357         if (to == VS_LOADED) {
  358 #ifndef __HAVE_ATOMIC_AS_MEMBAR
  359                 membar_release();
  360 #endif
  361                 atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
  362         } else {
  363                 atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
  364         }
  365 
  366         vip->vi_state = to;
  367         if (from == VS_LOADING)
  368                 cv_broadcast(&vcache_cv);
  369         if (to == VS_LOADED || to == VS_RECLAIMED)
  370                 cv_broadcast(&vp->v_cv);
  371 }
  372 
  373 #else /* defined(DIAGNOSTIC) */
  374 
  375 #define VSTATE_GET(vp) \
  376         (VNODE_TO_VIMPL((vp))->vi_state)
  377 #define VSTATE_CHANGE(vp, from, to) \
  378         vstate_change((vp), (from), (to))
  379 #define VSTATE_WAIT_STABLE(vp) \
  380         vstate_wait_stable((vp))
  381 void
  382 _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
  383     bool has_lock)
  384 {
  385 
  386 }
  387 
  388 static void
  389 vstate_wait_stable(vnode_t *vp)
  390 {
  391         vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
  392 
  393         while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
  394                 cv_wait(&vp->v_cv, vp->v_interlock);
  395 }
  396 
  397 static void
  398 vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
  399 {
  400         vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
  401 
  402         /* Open/close the gate for vcache_tryvget(). */
  403         if (to == VS_LOADED) {
  404 #ifndef __HAVE_ATOMIC_AS_MEMBAR
  405                 membar_release();
  406 #endif
  407                 atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
  408         } else {
  409                 atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
  410         }
  411 
  412         vip->vi_state = to;
  413         if (from == VS_LOADING)
  414                 cv_broadcast(&vcache_cv);
  415         if (to == VS_LOADED || to == VS_RECLAIMED)
  416                 cv_broadcast(&vp->v_cv);
  417 }
  418 
  419 #endif /* defined(DIAGNOSTIC) */
  420 
  421 void
  422 vfs_vnode_sysinit(void)
  423 {
  424         int error __diagused, i;
  425 
  426         dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
  427         KASSERT(dead_rootmount != NULL);
  428         dead_rootmount->mnt_iflag |= IMNT_MPSAFE;
  429 
  430         mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
  431         for (i = 0; i < LRU_COUNT; i++) {
  432                 TAILQ_INIT(&lru_list[i]);
  433         }
  434         vcache_init();
  435 
  436         cv_init(&vdrain_cv, "vdrain");
  437         cv_init(&vdrain_gen_cv, "vdrainwt");
  438         error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
  439             NULL, &vdrain_lwp, "vdrain");
  440         KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error);
  441 }
  442 
  443 /*
  444  * Allocate a new marker vnode.
  445  */
  446 vnode_t *
  447 vnalloc_marker(struct mount *mp)
  448 {
  449         vnode_impl_t *vip;
  450         vnode_t *vp;
  451 
  452         vip = pool_cache_get(vcache_pool, PR_WAITOK);
  453         memset(vip, 0, sizeof(*vip));
  454         vp = VIMPL_TO_VNODE(vip);
  455         uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
  456         vp->v_mount = mp;
  457         vp->v_type = VBAD;
  458         vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
  459         klist_init(&vip->vi_klist.vk_klist);
  460         vp->v_klist = &vip->vi_klist;
  461         vip->vi_state = VS_MARKER;
  462 
  463         return vp;
  464 }
  465 
  466 /*
  467  * Free a marker vnode.
  468  */
  469 void
  470 vnfree_marker(vnode_t *vp)
  471 {
  472         vnode_impl_t *vip;
  473 
  474         vip = VNODE_TO_VIMPL(vp);
  475         KASSERT(vip->vi_state == VS_MARKER);
  476         mutex_obj_free(vp->v_interlock);
  477         uvm_obj_destroy(&vp->v_uobj, true);
  478         klist_fini(&vip->vi_klist.vk_klist);
  479         pool_cache_put(vcache_pool, vip);
  480 }
  481 
  482 /*
  483  * Test a vnode for being a marker vnode.
  484  */
  485 bool
  486 vnis_marker(vnode_t *vp)
  487 {
  488 
  489         return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
  490 }
  491 
  492 /*
  493  * Return the lru list this node should be on.
  494  */
  495 static vnodelst_t *
  496 lru_which(vnode_t *vp)
  497 {
  498 
  499         KASSERT(mutex_owned(vp->v_interlock));
  500 
  501         if (vp->v_holdcnt > 0)
  502                 return &lru_list[LRU_HOLD];
  503         else
  504                 return &lru_list[LRU_FREE];
  505 }
  506 
  507 /*
   508  * Put the vnode at the end of the given list.
   509  * Both the current and the new list may be NULL, as happens on vnode alloc/free.
  510  * Adjust numvnodes and signal vdrain thread if there is work.
  511  */
  512 static void
  513 lru_requeue(vnode_t *vp, vnodelst_t *listhd)
  514 {
  515         vnode_impl_t *vip;
  516         int d;
  517 
  518         /*
  519          * If the vnode is on the correct list, and was put there recently,
  520          * then leave it be, thus avoiding huge cache and lock contention.
  521          */
  522         vip = VNODE_TO_VIMPL(vp);
  523         if (listhd == vip->vi_lrulisthd &&
  524             (getticks() - vip->vi_lrulisttm) < hz) {
  525                 return;
  526         }
  527 
  528         mutex_enter(&vdrain_lock);
  529         d = 0;
  530         if (vip->vi_lrulisthd != NULL)
  531                 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
  532         else
  533                 d++;
  534         vip->vi_lrulisthd = listhd;
  535         vip->vi_lrulisttm = getticks();
  536         if (vip->vi_lrulisthd != NULL)
  537                 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
  538         else
  539                 d--;
  540         if (d != 0) {
  541                 /*
  542                  * Looks strange?  This is not a bug.  Don't store
  543                  * numvnodes unless there is a change - avoid false
  544                  * sharing on MP.
  545                  */
  546                 numvnodes += d;
  547         }
  548         if ((d > 0 && numvnodes > desiredvnodes) ||
  549             listhd == &lru_list[LRU_VRELE])
  550                 cv_signal(&vdrain_cv);
  551         mutex_exit(&vdrain_lock);
  552 }
  553 
  554 /*
  555  * Release deferred vrele vnodes for this mount.
  556  * Called with file system suspended.
  557  */
  558 void
  559 vrele_flush(struct mount *mp)
  560 {
  561         vnode_impl_t *vip, *marker;
  562         vnode_t *vp;
  563         int when = 0;
  564 
  565         KASSERT(fstrans_is_owner(mp));
  566 
  567         marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
  568 
  569         mutex_enter(&vdrain_lock);
  570         TAILQ_INSERT_HEAD(&lru_list[LRU_VRELE], marker, vi_lrulist);
  571 
  572         while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
  573                 TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist);
  574                 TAILQ_INSERT_AFTER(&lru_list[LRU_VRELE], vip, marker,
  575                     vi_lrulist);
  576                 vp = VIMPL_TO_VNODE(vip);
  577                 if (vnis_marker(vp))
  578                         continue;
  579 
  580                 KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
  581                 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
  582                 vip->vi_lrulisthd = &lru_list[LRU_HOLD];
  583                 vip->vi_lrulisttm = getticks();
  584                 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
  585                 mutex_exit(&vdrain_lock);
  586 
  587                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  588                 mutex_enter(vp->v_interlock);
  589                 vrelel(vp, 0, LK_EXCLUSIVE);
  590 
  591                 if (getticks() > when) {
  592                         yield();
  593                         when = getticks() + hz / 10;
  594                 }
  595 
  596                 mutex_enter(&vdrain_lock);
  597         }
  598 
  599         TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist);
  600         mutex_exit(&vdrain_lock);
  601 
  602         vnfree_marker(VIMPL_TO_VNODE(marker));
  603 }
  604 
  605 /*
  606  * Reclaim a cached vnode.  Used from vdrain_thread only.
  607  */
  608 static __inline void
  609 vdrain_remove(vnode_t *vp)
  610 {
  611         struct mount *mp;
  612 
  613         KASSERT(mutex_owned(&vdrain_lock));
  614 
  615         /* Probe usecount (unlocked). */
  616         if (vrefcnt(vp) > 0)
  617                 return;
  618         /* Try v_interlock -- we lock the wrong direction! */
  619         if (!mutex_tryenter(vp->v_interlock))
  620                 return;
  621         /* Probe usecount and state. */
  622         if (vrefcnt(vp) > 0 || VSTATE_GET(vp) != VS_LOADED) {
  623                 mutex_exit(vp->v_interlock);
  624                 return;
  625         }
  626         mp = vp->v_mount;
  627         if (fstrans_start_nowait(mp) != 0) {
  628                 mutex_exit(vp->v_interlock);
  629                 return;
  630         }
  631         vdrain_retry = true;
  632         mutex_exit(&vdrain_lock);
  633 
  634         if (vcache_vget(vp) == 0) {
  635                 if (!vrecycle(vp)) {
  636                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  637                         mutex_enter(vp->v_interlock);
  638                         vrelel(vp, 0, LK_EXCLUSIVE);
  639                 }
  640         }
  641         fstrans_done(mp);
  642 
  643         mutex_enter(&vdrain_lock);
  644 }
  645 
  646 /*
  647  * Release a cached vnode.  Used from vdrain_thread only.
  648  */
  649 static __inline void
  650 vdrain_vrele(vnode_t *vp)
  651 {
  652         vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
  653         struct mount *mp;
  654 
  655         KASSERT(mutex_owned(&vdrain_lock));
  656 
  657         mp = vp->v_mount;
  658         if (fstrans_start_nowait(mp) != 0)
  659                 return;
  660 
  661         /*
  662          * First remove the vnode from the vrele list.
   663          * Put it on the last lru list; the last vrele()
  664          * will put it back onto the right list before
  665          * its usecount reaches zero.
  666          */
  667         KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
  668         TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
  669         vip->vi_lrulisthd = &lru_list[LRU_HOLD];
  670         vip->vi_lrulisttm = getticks();
  671         TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
  672 
  673         vdrain_retry = true;
  674         mutex_exit(&vdrain_lock);
  675 
  676         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  677         mutex_enter(vp->v_interlock);
  678         vrelel(vp, 0, LK_EXCLUSIVE);
  679         fstrans_done(mp);
  680 
  681         mutex_enter(&vdrain_lock);
  682 }
  683 
  684 /*
  685  * Helper thread to keep the number of vnodes below desiredvnodes
  686  * and release vnodes from asynchronous vrele.
  687  */
  688 static void
  689 vdrain_thread(void *cookie)
  690 {
  691         int i;
  692         u_int target;
  693         vnode_impl_t *vip, *marker;
  694 
  695         marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
  696 
  697         mutex_enter(&vdrain_lock);
  698 
  699         for (;;) {
  700                 vdrain_retry = false;
  701                 target = desiredvnodes - desiredvnodes/10;
  702 
  703                 for (i = 0; i < LRU_COUNT; i++) {
  704                         TAILQ_INSERT_HEAD(&lru_list[i], marker, vi_lrulist);
  705                         while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
  706                                 TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist);
  707                                 TAILQ_INSERT_AFTER(&lru_list[i], vip, marker,
  708                                     vi_lrulist);
  709                                 if (vnis_marker(VIMPL_TO_VNODE(vip)))
  710                                         continue;
  711                                 if (i == LRU_VRELE)
  712                                         vdrain_vrele(VIMPL_TO_VNODE(vip));
  713                                 else if (numvnodes < target)
  714                                         break;
  715                                 else
  716                                         vdrain_remove(VIMPL_TO_VNODE(vip));
  717                         }
  718                         TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist);
  719                 }
  720 
  721                 if (vdrain_retry) {
  722                         kpause("vdrainrt", false, 1, &vdrain_lock);
  723                 } else {
  724                         vdrain_gen++;
  725                         cv_broadcast(&vdrain_gen_cv);
  726                         cv_wait(&vdrain_cv, &vdrain_lock);
  727                 }
  728         }
  729 }
  730 
  731 /*
  732  * Try to drop reference on a vnode.  Abort if we are releasing the
  733  * last reference.  Note: this _must_ succeed if not the last reference.
  734  */
  735 static bool
  736 vtryrele(vnode_t *vp)
  737 {
  738         u_int use, next;
  739 
  740 #ifndef __HAVE_ATOMIC_AS_MEMBAR
  741         membar_release();
  742 #endif
  743         for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
  744                 if (__predict_false((use & VUSECOUNT_MASK) == 1)) {
  745                         return false;
  746                 }
  747                 KASSERT((use & VUSECOUNT_MASK) > 1);
  748                 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
  749                 if (__predict_true(next == use)) {
  750                         return true;
  751                 }
  752         }
  753 }
  754 
  755 /*
  756  * vput: unlock and release the reference.
  757  */
  758 void
  759 vput(vnode_t *vp)
  760 {
  761         int lktype;
  762 
  763         /*
  764          * Do an unlocked check of the usecount.  If it looks like we're not
  765          * about to drop the last reference, then unlock the vnode and try
  766          * to drop the reference.  If it ends up being the last reference
  767          * after all, vrelel() can fix it all up.  Most of the time this
  768          * will all go to plan.
  769          */
  770         if (vrefcnt(vp) > 1) {
  771                 VOP_UNLOCK(vp);
  772                 if (vtryrele(vp)) {
  773                         return;
  774                 }
  775                 lktype = LK_NONE;
  776         } else {
  777                 lktype = VOP_ISLOCKED(vp);
  778                 KASSERT(lktype != LK_NONE);
  779         }
  780         mutex_enter(vp->v_interlock);
  781         vrelel(vp, 0, lktype);
  782 }
  783 
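/*
 * Editorial illustration (not part of the original source): the conventional
 * pairing of the locking and reference interfaces.  Conceptually,
 *
 *      vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *      // ... use the locked, referenced vnode ...
 *      vput(vp);
 *
 * is equivalent to VOP_UNLOCK(vp) followed by vrele(vp), except that vput()
 * may keep the lock and hand it to vrelel() when it drops the last reference.
 */
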
  784 /*
  785  * Vnode release.  If reference count drops to zero, call inactive
  786  * routine and either return to freelist or free to the pool.
  787  */
  788 static void
  789 vrelel(vnode_t *vp, int flags, int lktype)
  790 {
  791         const bool async = ((flags & VRELEL_ASYNC) != 0);
  792         bool recycle, defer, objlock_held;
  793         u_int use, next;
  794         int error;
  795 
  796         objlock_held = false;
  797 
  798 retry:
  799         KASSERT(mutex_owned(vp->v_interlock));
  800 
  801         if (__predict_false(vp->v_op == dead_vnodeop_p &&
  802             VSTATE_GET(vp) != VS_RECLAIMED)) {
  803                 vnpanic(vp, "dead but not clean");
  804         }
  805 
  806         /*
  807          * If not the last reference, just unlock and drop the reference count.
  808          *
  809          * Otherwise make sure we pass a point in time where we hold the
  810          * last reference with VGET flag unset.
  811          */
  812         for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
  813                 if (__predict_false((use & VUSECOUNT_MASK) > 1)) {
  814                         if (objlock_held) {
  815                                 objlock_held = false;
  816                                 rw_exit(vp->v_uobj.vmobjlock);
  817                         }
  818                         if (lktype != LK_NONE) {
  819                                 mutex_exit(vp->v_interlock);
  820                                 lktype = LK_NONE;
  821                                 VOP_UNLOCK(vp);
  822                                 mutex_enter(vp->v_interlock);
  823                         }
  824                         if (vtryrele(vp)) {
  825                                 mutex_exit(vp->v_interlock);
  826                                 return;
  827                         }
  828                         next = atomic_load_relaxed(&vp->v_usecount);
  829                         continue;
  830                 }
  831                 KASSERT((use & VUSECOUNT_MASK) == 1);
  832                 next = use & ~VUSECOUNT_VGET;
  833                 if (next != use) {
  834                         next = atomic_cas_uint(&vp->v_usecount, use, next);
  835                 }
  836                 if (__predict_true(next == use)) {
  837                         break;
  838                 }
  839         }
  840 #ifndef __HAVE_ATOMIC_AS_MEMBAR
  841         membar_acquire();
  842 #endif
  843         if (vrefcnt(vp) <= 0 || vp->v_writecount != 0) {
  844                 vnpanic(vp, "%s: bad ref count", __func__);
  845         }
  846 
  847 #ifdef DIAGNOSTIC
  848         if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
  849             vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
  850                 vprint("vrelel: missing VOP_CLOSE()", vp);
  851         }
  852 #endif
  853 
  854         /*
  855          * If already clean there is no need to lock, defer or
  856          * deactivate this node.
  857          */
  858         if (VSTATE_GET(vp) == VS_RECLAIMED) {
  859                 if (objlock_held) {
  860                         objlock_held = false;
  861                         rw_exit(vp->v_uobj.vmobjlock);
  862                 }
  863                 if (lktype != LK_NONE) {
  864                         mutex_exit(vp->v_interlock);
  865                         lktype = LK_NONE;
  866                         VOP_UNLOCK(vp);
  867                         mutex_enter(vp->v_interlock);
  868                 }
  869                 goto out;
  870         }
  871 
  872         /*
  873          * First try to get the vnode locked for VOP_INACTIVE().
   874          * Defer vnode release to vdrain_thread if the caller requests it
   875          * explicitly, the caller is the pagedaemon, or the lock attempt failed.
  876          */
  877         defer = false;
  878         if ((curlwp == uvm.pagedaemon_lwp) || async) {
  879                 defer = true;
  880         } else if (lktype == LK_SHARED) {
   881                  /* Excellent chance of getting the lock if this is the last ref. */
  882                 error = vn_lock(vp, LK_UPGRADE | LK_RETRY | LK_NOWAIT);
  883                 if (error != 0) {
  884                         defer = true;
  885                 } else {
  886                         lktype = LK_EXCLUSIVE;
  887                 }
  888         } else if (lktype == LK_NONE) {
   889                  /* Excellent chance of getting the lock if this is the last ref. */
  890                 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
  891                 if (error != 0) {
  892                         defer = true;
  893                 } else {
  894                         lktype = LK_EXCLUSIVE;
  895                 }
  896         }
  897         KASSERT(mutex_owned(vp->v_interlock));
  898         if (defer) {
  899                 /*
  900                  * Defer reclaim to the kthread; it's not safe to
  901                  * clean it here.  We donate it our last reference.
  902                  */
  903                 if (lktype != LK_NONE) {
  904                         mutex_exit(vp->v_interlock);
  905                         VOP_UNLOCK(vp);
  906                         mutex_enter(vp->v_interlock);
  907                 }
  908                 lru_requeue(vp, &lru_list[LRU_VRELE]);
  909                 mutex_exit(vp->v_interlock);
  910                 return;
  911         }
  912         KASSERT(lktype == LK_EXCLUSIVE);
  913 
  914         /* If the node gained another reference, retry. */
  915         use = atomic_load_relaxed(&vp->v_usecount);
  916         if ((use & VUSECOUNT_VGET) != 0) {
  917                 goto retry;
  918         }
  919         KASSERT((use & VUSECOUNT_MASK) == 1);
  920 
  921         if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP|VI_WRMAP)) != 0 ||
  922             (vp->v_vflag & VV_MAPPED) != 0) {
  923                 /* Take care of space accounting. */
  924                 if (!objlock_held) {
  925                         objlock_held = true;
  926                         if (!rw_tryenter(vp->v_uobj.vmobjlock, RW_WRITER)) {
  927                                 mutex_exit(vp->v_interlock);
  928                                 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
  929                                 mutex_enter(vp->v_interlock);
  930                                 goto retry;
  931                         }
  932                 }
  933                 if ((vp->v_iflag & VI_EXECMAP) != 0) {
  934                         cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
  935                 }
  936                 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
  937                 vp->v_vflag &= ~VV_MAPPED;
  938         }
  939         if (objlock_held) {
  940                 objlock_held = false;
  941                 rw_exit(vp->v_uobj.vmobjlock);
  942         }
  943 
  944         /*
  945          * Deactivate the vnode, but preserve our reference across
  946          * the call to VOP_INACTIVE().
  947          *
  948          * If VOP_INACTIVE() indicates that the file has been
  949          * deleted, then recycle the vnode.
  950          *
  951          * Note that VOP_INACTIVE() will not drop the vnode lock.
  952          */
  953         mutex_exit(vp->v_interlock);
  954         recycle = false;
  955         VOP_INACTIVE(vp, &recycle);
  956         if (!recycle) {
  957                 lktype = LK_NONE;
  958                 VOP_UNLOCK(vp);
  959         }
  960         mutex_enter(vp->v_interlock);
  961 
  962         /*
  963          * Block new references then check again to see if a
  964          * new reference was acquired in the meantime.  If
  965          * it was, restore the vnode state and try again.
  966          */
  967         if (recycle) {
  968                 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
  969                 use = atomic_load_relaxed(&vp->v_usecount);
  970                 if ((use & VUSECOUNT_VGET) != 0) {
  971                         VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
  972                         goto retry;
  973                 }
  974                 KASSERT((use & VUSECOUNT_MASK) == 1);
  975         }
  976 
  977         /*
  978          * Recycle the vnode if the file is now unused (unlinked).
  979          */
  980         if (recycle) {
  981                 VSTATE_ASSERT(vp, VS_BLOCKED);
  982                 KASSERT(lktype == LK_EXCLUSIVE);
  983                 /* vcache_reclaim drops the lock. */
  984                 lktype = LK_NONE;
  985                 vcache_reclaim(vp);
  986         }
  987         KASSERT(vrefcnt(vp) > 0);
  988         KASSERT(lktype == LK_NONE);
  989 
  990 out:
  991         for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
  992                 if (__predict_false((use & VUSECOUNT_VGET) != 0 &&
  993                     (use & VUSECOUNT_MASK) == 1)) {
  994                         /* Gained and released another reference, retry. */
  995                         goto retry;
  996                 }
  997                 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
  998                 if (__predict_true(next == use)) {
  999                         if (__predict_false((use & VUSECOUNT_MASK) != 1)) {
 1000                                 /* Gained another reference. */
 1001                                 mutex_exit(vp->v_interlock);
 1002                                 return;
 1003                         }
 1004                         break;
 1005                 }
 1006         }
 1007 #ifndef __HAVE_ATOMIC_AS_MEMBAR
 1008         membar_acquire();
 1009 #endif
 1010 
 1011         if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) {
 1012                 /*
 1013                  * It's clean so destroy it.  It isn't referenced
 1014                  * anywhere since it has been reclaimed.
 1015                  */
 1016                 vcache_free(VNODE_TO_VIMPL(vp));
 1017         } else {
 1018                 /*
 1019                  * Otherwise, put it back onto the freelist.  It
 1020                  * can't be destroyed while still associated with
 1021                  * a file system.
 1022                  */
 1023                 lru_requeue(vp, lru_which(vp));
 1024                 mutex_exit(vp->v_interlock);
 1025         }
 1026 }
 1027 
 1028 void
 1029 vrele(vnode_t *vp)
 1030 {
 1031 
 1032         if (vtryrele(vp)) {
 1033                 return;
 1034         }
 1035         mutex_enter(vp->v_interlock);
 1036         vrelel(vp, 0, LK_NONE);
 1037 }
 1038 
 1039 /*
  1040  * Asynchronous vnode release: the vnode is released in a different context.
 1041  */
 1042 void
 1043 vrele_async(vnode_t *vp)
 1044 {
 1045 
 1046         if (vtryrele(vp)) {
 1047                 return;
 1048         }
 1049         mutex_enter(vp->v_interlock);
 1050         vrelel(vp, VRELEL_ASYNC, LK_NONE);
 1051 }
 1052 
 1053 /*
 1054  * Vnode reference, where a reference is already held by some other
 1055  * object (for example, a file structure).
 1056  *
 1057  * NB: lockless code sequences may rely on this not blocking.
 1058  */
 1059 void
 1060 vref(vnode_t *vp)
 1061 {
 1062 
 1063         KASSERT(vrefcnt(vp) > 0);
 1064 
 1065         atomic_inc_uint(&vp->v_usecount);
 1066 }
 1067 
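/*
 * Editorial illustration (not part of the original source): vref() is only
 * legal while another reference is already held, for example when some
 * long-lived object wants its own reference to a vnode the caller already
 * references.  The "obj" consumer below is hypothetical.
 *
 *      KASSERT(vrefcnt(vp) > 0);       // caller's reference keeps vp alive
 *      vref(vp);                       // take an additional reference
 *      obj->o_vnode = vp;
 *      ...
 *      vrele(obj->o_vnode);            // much later, drop that reference
 */
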
 1068 /*
 1069  * Page or buffer structure gets a reference.
 1070  * Called with v_interlock held.
 1071  */
 1072 void
 1073 vholdl(vnode_t *vp)
 1074 {
 1075 
 1076         KASSERT(mutex_owned(vp->v_interlock));
 1077 
 1078         if (vp->v_holdcnt++ == 0 && vrefcnt(vp) == 0)
 1079                 lru_requeue(vp, lru_which(vp));
 1080 }
 1081 
 1082 /*
 1083  * Page or buffer structure gets a reference.
 1084  */
 1085 void
 1086 vhold(vnode_t *vp)
 1087 {
 1088 
 1089         mutex_enter(vp->v_interlock);
 1090         vholdl(vp);
 1091         mutex_exit(vp->v_interlock);
 1092 }
 1093 
 1094 /*
 1095  * Page or buffer structure frees a reference.
 1096  * Called with v_interlock held.
 1097  */
 1098 void
 1099 holdrelel(vnode_t *vp)
 1100 {
 1101 
 1102         KASSERT(mutex_owned(vp->v_interlock));
 1103 
 1104         if (vp->v_holdcnt <= 0) {
 1105                 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
 1106         }
 1107 
 1108         vp->v_holdcnt--;
 1109         if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
 1110                 lru_requeue(vp, lru_which(vp));
 1111 }
 1112 
 1113 /*
 1114  * Page or buffer structure frees a reference.
 1115  */
 1116 void
 1117 holdrele(vnode_t *vp)
 1118 {
 1119 
 1120         mutex_enter(vp->v_interlock);
 1121         holdrelel(vp);
 1122         mutex_exit(vp->v_interlock);
 1123 }
 1124 
 1125 /*
 1126  * Recycle an unused vnode if caller holds the last reference.
 1127  */
 1128 bool
 1129 vrecycle(vnode_t *vp)
 1130 {
 1131         int error __diagused;
 1132 
 1133         mutex_enter(vp->v_interlock);
 1134 
 1135         /* If the vnode is already clean we're done. */
 1136         VSTATE_WAIT_STABLE(vp);
 1137         if (VSTATE_GET(vp) != VS_LOADED) {
 1138                 VSTATE_ASSERT(vp, VS_RECLAIMED);
 1139                 vrelel(vp, 0, LK_NONE);
 1140                 return true;
 1141         }
 1142 
 1143         /* Prevent further references until the vnode is locked. */
 1144         VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
 1145 
 1146         /* Make sure we hold the last reference. */
 1147         if (vrefcnt(vp) != 1) {
 1148                 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
 1149                 mutex_exit(vp->v_interlock);
 1150                 return false;
 1151         }
 1152 
 1153         mutex_exit(vp->v_interlock);
 1154 
 1155         /*
 1156          * On a leaf file system this lock will always succeed as we hold
 1157          * the last reference and prevent further references.
 1158          * On layered file systems waiting for the lock would open a can of
 1159          * deadlocks as the lower vnodes may have other active references.
 1160          */
 1161         error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
 1162 
 1163         mutex_enter(vp->v_interlock);
 1164         if (error) {
 1165                 VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
 1166                 mutex_exit(vp->v_interlock);
 1167                 return false;
 1168         }
 1169 
 1170         KASSERT(vrefcnt(vp) == 1);
 1171         vcache_reclaim(vp);
 1172         vrelel(vp, 0, LK_NONE);
 1173 
 1174         return true;
 1175 }
 1176 
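/*
 * Editorial illustration (not part of the original source): the usual caller
 * pattern for vrecycle(), mirroring its use in vdrain_remove() above.  On
 * success vrecycle() reclaims the vnode and consumes the reference; on
 * failure the caller still owns its (unlocked) reference and must release it.
 *
 *      if (!vrecycle(vp))
 *              vrele(vp);
 */
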
 1177 /*
 1178  * Helper for vrevoke() to propagate suspension from lastmp
 1179  * to thismp.  Both args may be NULL.
 1180  * Returns the currently suspended file system or NULL.
 1181  */
 1182 static struct mount *
 1183 vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp)
 1184 {
 1185         int error;
 1186 
 1187         if (lastmp == thismp)
 1188                 return thismp;
 1189 
 1190         if (lastmp != NULL)
 1191                 vfs_resume(lastmp);
 1192 
 1193         if (thismp == NULL)
 1194                 return NULL;
 1195 
 1196         do {
 1197                 error = vfs_suspend(thismp, 0);
 1198         } while (error == EINTR || error == ERESTART);
 1199 
 1200         if (error == 0)
 1201                 return thismp;
 1202 
 1203         KASSERT(error == EOPNOTSUPP || error == ENOENT);
 1204         return NULL;
 1205 }
 1206 
 1207 /*
 1208  * Eliminate all activity associated with the requested vnode
 1209  * and with all vnodes aliased to the requested vnode.
 1210  */
 1211 void
 1212 vrevoke(vnode_t *vp)
 1213 {
 1214         struct mount *mp;
 1215         vnode_t *vq;
 1216         enum vtype type;
 1217         dev_t dev;
 1218 
 1219         KASSERT(vrefcnt(vp) > 0);
 1220 
 1221         mp = vrevoke_suspend_next(NULL, vp->v_mount);
 1222 
 1223         mutex_enter(vp->v_interlock);
 1224         VSTATE_WAIT_STABLE(vp);
 1225         if (VSTATE_GET(vp) == VS_RECLAIMED) {
 1226                 mutex_exit(vp->v_interlock);
 1227         } else if (vp->v_type != VBLK && vp->v_type != VCHR) {
 1228                 atomic_inc_uint(&vp->v_usecount);
 1229                 mutex_exit(vp->v_interlock);
 1230                 vgone(vp);
 1231         } else {
 1232                 dev = vp->v_rdev;
 1233                 type = vp->v_type;
 1234                 mutex_exit(vp->v_interlock);
 1235 
 1236                 while (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, &vq)
 1237                     == 0) {
 1238                         mp = vrevoke_suspend_next(mp, vq->v_mount);
 1239                         vgone(vq);
 1240                 }
 1241         }
 1242         vrevoke_suspend_next(mp, NULL);
 1243 }
 1244 
 1245 /*
 1246  * Eliminate all activity associated with a vnode in preparation for
 1247  * reuse.  Drops a reference from the vnode.
 1248  */
 1249 void
 1250 vgone(vnode_t *vp)
 1251 {
 1252         int lktype;
 1253 
 1254         KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
 1255 
 1256         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 1257         lktype = LK_EXCLUSIVE;
 1258         mutex_enter(vp->v_interlock);
 1259         VSTATE_WAIT_STABLE(vp);
 1260         if (VSTATE_GET(vp) == VS_LOADED) {
 1261                 VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
 1262                 vcache_reclaim(vp);
 1263                 lktype = LK_NONE;
 1264         }
 1265         VSTATE_ASSERT(vp, VS_RECLAIMED);
 1266         vrelel(vp, 0, lktype);
 1267 }
 1268 
 1269 static inline uint32_t
 1270 vcache_hash(const struct vcache_key *key)
 1271 {
 1272         uint32_t hash = HASH32_BUF_INIT;
 1273 
 1274         KASSERT(key->vk_key_len > 0);
 1275 
 1276         hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
 1277         hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
 1278         return hash;
 1279 }
 1280 
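/*
 * Editorial illustration (not part of the original source): the cache key is
 * an opaque byte string chosen by the file system, typically its inode
 * number.  A key as consumed by vcache_hash() and vcache_get() might be
 * built like this (ino is an assumed ino_t):
 *
 *      struct vcache_key key = {
 *              .vk_mount = mp,
 *              .vk_key = &ino,
 *              .vk_key_len = sizeof(ino),
 *      };
 *      hash = vcache_hash(&key);
 */
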
 1281 static int
 1282 vcache_stats(struct hashstat_sysctl *hs, bool fill)
 1283 {
 1284         vnode_impl_t *vip;
 1285         uint64_t chain;
 1286 
 1287         strlcpy(hs->hash_name, "vcache", sizeof(hs->hash_name));
 1288         strlcpy(hs->hash_desc, "vnode cache hash", sizeof(hs->hash_desc));
 1289         if (!fill)
 1290                 return 0;
 1291 
 1292         hs->hash_size = vcache_hashmask + 1;
 1293 
 1294         for (size_t i = 0; i < hs->hash_size; i++) {
 1295                 chain = 0;
 1296                 mutex_enter(&vcache_lock);
 1297                 SLIST_FOREACH(vip, &vcache_hashtab[i], vi_hash) {
 1298                         chain++;
 1299                 }
 1300                 mutex_exit(&vcache_lock);
 1301                 if (chain > 0) {
 1302                         hs->hash_used++;
 1303                         hs->hash_items += chain;
 1304                         if (chain > hs->hash_maxchain)
 1305                                 hs->hash_maxchain = chain;
 1306                 }
 1307                 preempt_point();
 1308         }
 1309 
 1310         return 0;
 1311 }
 1312 
 1313 static void
 1314 vcache_init(void)
 1315 {
 1316 
 1317         vcache_pool = pool_cache_init(sizeof(vnode_impl_t), coherency_unit,
 1318             0, 0, "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
 1319         KASSERT(vcache_pool != NULL);
 1320         mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
 1321         cv_init(&vcache_cv, "vcache");
 1322         vcache_hashsize = desiredvnodes;
 1323         vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
 1324             &vcache_hashmask);
 1325         hashstat_register("vcache", vcache_stats);
 1326 }
 1327 
 1328 static void
 1329 vcache_reinit(void)
 1330 {
 1331         int i;
 1332         uint32_t hash;
 1333         u_long oldmask, newmask;
 1334         struct hashhead *oldtab, *newtab;
 1335         vnode_impl_t *vip;
 1336 
 1337         newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
 1338         mutex_enter(&vcache_lock);
 1339         oldtab = vcache_hashtab;
 1340         oldmask = vcache_hashmask;
 1341         vcache_hashsize = desiredvnodes;
 1342         vcache_hashtab = newtab;
 1343         vcache_hashmask = newmask;
 1344         for (i = 0; i <= oldmask; i++) {
 1345                 while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) {
 1346                         SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash);
 1347                         hash = vcache_hash(&vip->vi_key);
 1348                         SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask],
 1349                             vip, vi_hash);
 1350                 }
 1351         }
 1352         mutex_exit(&vcache_lock);
 1353         hashdone(oldtab, HASH_SLIST, oldmask);
 1354 }
 1355 
 1356 static inline vnode_impl_t *
 1357 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
 1358 {
 1359         struct hashhead *hashp;
 1360         vnode_impl_t *vip;
 1361 
 1362         KASSERT(mutex_owned(&vcache_lock));
 1363 
 1364         hashp = &vcache_hashtab[hash & vcache_hashmask];
 1365         SLIST_FOREACH(vip, hashp, vi_hash) {
 1366                 if (key->vk_mount != vip->vi_key.vk_mount)
 1367                         continue;
 1368                 if (key->vk_key_len != vip->vi_key.vk_key_len)
 1369                         continue;
 1370                 if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len))
 1371                         continue;
 1372                 return vip;
 1373         }
 1374         return NULL;
 1375 }
 1376 
 1377 /*
 1378  * Allocate a new, uninitialized vcache node.
 1379  */
 1380 static vnode_impl_t *
 1381 vcache_alloc(void)
 1382 {
 1383         vnode_impl_t *vip;
 1384         vnode_t *vp;
 1385 
 1386         vip = pool_cache_get(vcache_pool, PR_WAITOK);
 1387         vp = VIMPL_TO_VNODE(vip);
 1388         memset(vip, 0, sizeof(*vip));
 1389 
 1390         rw_init(&vip->vi_lock);
 1391         vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
 1392 
 1393         uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
 1394         klist_init(&vip->vi_klist.vk_klist);
 1395         vp->v_klist = &vip->vi_klist;
 1396         cv_init(&vp->v_cv, "vnode");
 1397         cache_vnode_init(vp);
 1398 
 1399         vp->v_usecount = 1;
 1400         vp->v_type = VNON;
 1401         vp->v_size = vp->v_writesize = VSIZENOTSET;
 1402 
 1403         vip->vi_state = VS_LOADING;
 1404 
 1405         lru_requeue(vp, &lru_list[LRU_FREE]);
 1406 
 1407         return vip;
 1408 }
 1409 
 1410 /*
 1411  * Deallocate a vcache node in state VS_LOADING.
 1412  *
 1413  * vcache_lock held on entry and released on return.
 1414  */
 1415 static void
 1416 vcache_dealloc(vnode_impl_t *vip)
 1417 {
 1418         vnode_t *vp;
 1419 
 1420         KASSERT(mutex_owned(&vcache_lock));
 1421 
 1422         vp = VIMPL_TO_VNODE(vip);
 1423         vfs_ref(dead_rootmount);
 1424         vfs_insmntque(vp, dead_rootmount);
 1425         mutex_enter(vp->v_interlock);
 1426         vp->v_op = dead_vnodeop_p;
 1427         VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
 1428         mutex_exit(&vcache_lock);
 1429         vrelel(vp, 0, LK_NONE);
 1430 }
 1431 
 1432 /*
 1433  * Free an unused, unreferenced vcache node.
 1434  * v_interlock locked on entry.
 1435  */
 1436 static void
 1437 vcache_free(vnode_impl_t *vip)
 1438 {
 1439         vnode_t *vp;
 1440 
 1441         vp = VIMPL_TO_VNODE(vip);
 1442         KASSERT(mutex_owned(vp->v_interlock));
 1443 
 1444         KASSERT(vrefcnt(vp) == 0);
 1445         KASSERT(vp->v_holdcnt == 0);
 1446         KASSERT(vp->v_writecount == 0);
 1447         lru_requeue(vp, NULL);
 1448         mutex_exit(vp->v_interlock);
 1449 
 1450         vfs_insmntque(vp, NULL);
 1451         if (vp->v_type == VBLK || vp->v_type == VCHR)
 1452                 spec_node_destroy(vp);
 1453 
 1454         mutex_obj_free(vp->v_interlock);
 1455         rw_destroy(&vip->vi_lock);
 1456         uvm_obj_destroy(&vp->v_uobj, true);
 1457         KASSERT(vp->v_klist == &vip->vi_klist);
 1458         klist_fini(&vip->vi_klist.vk_klist);
 1459         cv_destroy(&vp->v_cv);
 1460         cache_vnode_fini(vp);
 1461         pool_cache_put(vcache_pool, vip);
 1462 }
 1463 
 1464 /*
 1465  * Try to get an initial reference on this cached vnode.
 1466  * Returns zero on success or EBUSY if the vnode state is not LOADED.
 1467  *
 1468  * NB: lockless code sequences may rely on this not blocking.
 1469  */
 1470 int
 1471 vcache_tryvget(vnode_t *vp)
 1472 {
 1473         u_int use, next;
 1474 
 1475         for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
 1476                 if (__predict_false((use & VUSECOUNT_GATE) == 0)) {
 1477                         return EBUSY;
 1478                 }
 1479                 next = atomic_cas_uint(&vp->v_usecount,
 1480                     use, (use + 1) | VUSECOUNT_VGET);
 1481                 if (__predict_true(next == use)) {
 1482 #ifndef __HAVE_ATOMIC_AS_MEMBAR
 1483                         membar_acquire();
 1484 #endif
 1485                         return 0;
 1486                 }
 1487         }
 1488 }
 1489 
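/*
 * A minimal usage sketch for vcache_tryvget(), assuming a caller that
 * reached vp without taking any locks (e.g. from a pserialized lookup
 * structure) and that falls back to a slower, interlocked path on
 * failure.  The helper name example_lookup_fast() is hypothetical and
 * not part of this file.
 */
static int
example_lookup_fast(vnode_t *vp, vnode_t **vpp)
{

        /*
         * Try for a reference without blocking; this is the property
         * the lockless callers depend on.  EBUSY means the gate is
         * closed (the vnode is not VS_LOADED) and the caller should
         * retry through a locked path.
         */
        if (vcache_tryvget(vp) != 0)
                return EBUSY;

        *vpp = vp;
        return 0;
}
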
 1490 /*
 1491  * Try to get an initial reference on this cached vnode.
 1492  * Returns zero on success or ENOENT if the vnode has been reclaimed.
 1493  * Will wait for the vnode state to be stable.
 1494  *
 1495  * v_interlock locked on entry and unlocked on exit.
 1496  */
 1497 int
 1498 vcache_vget(vnode_t *vp)
 1499 {
 1500         int error;
 1501 
 1502         KASSERT(mutex_owned(vp->v_interlock));
 1503 
 1504         /* Increment hold count to prevent vnode from disappearing. */
 1505         vp->v_holdcnt++;
 1506         VSTATE_WAIT_STABLE(vp);
 1507         vp->v_holdcnt--;
 1508 
 1509         /* If this was the last reference to a reclaimed vnode, free it now. */
 1510         if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) {
 1511                 if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
 1512                         vcache_free(VNODE_TO_VIMPL(vp));
 1513                 else
 1514                         mutex_exit(vp->v_interlock);
 1515                 return ENOENT;
 1516         }
 1517         VSTATE_ASSERT(vp, VS_LOADED);
 1518         error = vcache_tryvget(vp);
 1519         KASSERT(error == 0);
 1520         mutex_exit(vp->v_interlock);
 1521 
 1522         return 0;
 1523 }
 1524 
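/*
 * A minimal usage sketch for vcache_vget(), assuming a caller that
 * holds v_interlock, for example while iterating a per-mount vnode
 * list.  example_visit() is hypothetical and not part of this file.
 */
static int
example_visit(vnode_t *vp)
{
        int error;

        mutex_enter(vp->v_interlock);
        /* Releases v_interlock; may sleep waiting for a stable state. */
        error = vcache_vget(vp);
        if (error != 0)
                return error;           /* ENOENT: vnode was reclaimed */

        /* ... operate on the referenced, unlocked vnode ... */

        vrele(vp);
        return 0;
}
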
 1525 /*
 1526  * Get a vnode / fs node pair by key and return it referenced through vpp.
 1527  */
 1528 int
 1529 vcache_get(struct mount *mp, const void *key, size_t key_len,
 1530     struct vnode **vpp)
 1531 {
 1532         int error;
 1533         uint32_t hash;
 1534         const void *new_key;
 1535         struct vnode *vp;
 1536         struct vcache_key vcache_key;
 1537         vnode_impl_t *vip, *new_vip;
 1538 
 1539         new_key = NULL;
 1540         *vpp = NULL;
 1541 
 1542         vcache_key.vk_mount = mp;
 1543         vcache_key.vk_key = key;
 1544         vcache_key.vk_key_len = key_len;
 1545         hash = vcache_hash(&vcache_key);
 1546 
 1547 again:
 1548         mutex_enter(&vcache_lock);
 1549         vip = vcache_hash_lookup(&vcache_key, hash);
 1550 
 1551         /* If found, take a reference or retry. */
 1552         if (__predict_true(vip != NULL)) {
 1553                 /*
 1554                  * If the vnode is loading, we cannot take the v_interlock
 1555                  * here, as it might change during load (see uvm_obj_setlock()).
 1556                  * As changing state from VS_LOADING requires both vcache_lock
 1557                  * and v_interlock, it is safe to test with vcache_lock held.
 1558                  *
 1559                  * Wait for vnodes changing state from VS_LOADING and retry.
 1560                  */
 1561                 if (__predict_false(vip->vi_state == VS_LOADING)) {
 1562                         cv_wait(&vcache_cv, &vcache_lock);
 1563                         mutex_exit(&vcache_lock);
 1564                         goto again;
 1565                 }
 1566                 vp = VIMPL_TO_VNODE(vip);
 1567                 mutex_enter(vp->v_interlock);
 1568                 mutex_exit(&vcache_lock);
 1569                 error = vcache_vget(vp);
 1570                 if (error == ENOENT)
 1571                         goto again;
 1572                 if (error == 0)
 1573                         *vpp = vp;
 1574                 KASSERT((error != 0) == (*vpp == NULL));
 1575                 return error;
 1576         }
 1577         mutex_exit(&vcache_lock);
 1578 
 1579         /* Allocate and initialize a new vcache / vnode pair. */
 1580         error = vfs_busy(mp);
 1581         if (error)
 1582                 return error;
 1583         new_vip = vcache_alloc();
 1584         new_vip->vi_key = vcache_key;
 1585         vp = VIMPL_TO_VNODE(new_vip);
 1586         mutex_enter(&vcache_lock);
 1587         vip = vcache_hash_lookup(&vcache_key, hash);
 1588         if (vip == NULL) {
 1589                 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
 1590                     new_vip, vi_hash);
 1591                 vip = new_vip;
 1592         }
 1593 
 1594         /* If another thread beat us inserting this node, retry. */
 1595         if (vip != new_vip) {
 1596                 vcache_dealloc(new_vip);
 1597                 vfs_unbusy(mp);
 1598                 goto again;
 1599         }
 1600         mutex_exit(&vcache_lock);
 1601 
 1602         /* Load the fs node.  Exclusive as new_vip is VS_LOADING. */
 1603         error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
 1604         if (error) {
 1605                 mutex_enter(&vcache_lock);
 1606                 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
 1607                     new_vip, vnode_impl, vi_hash);
 1608                 vcache_dealloc(new_vip);
 1609                 vfs_unbusy(mp);
 1610                 KASSERT(*vpp == NULL);
 1611                 return error;
 1612         }
 1613         KASSERT(new_key != NULL);
 1614         KASSERT(memcmp(key, new_key, key_len) == 0);
 1615         KASSERT(vp->v_op != NULL);
 1616         vfs_insmntque(vp, mp);
 1617         if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
 1618                 vp->v_vflag |= VV_MPSAFE;
 1619         vfs_ref(mp);
 1620         vfs_unbusy(mp);
 1621 
 1622         /* Finished loading, finalize node. */
 1623         mutex_enter(&vcache_lock);
 1624         new_vip->vi_key.vk_key = new_key;
 1625         mutex_enter(vp->v_interlock);
 1626         VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
 1627         mutex_exit(vp->v_interlock);
 1628         mutex_exit(&vcache_lock);
 1629         *vpp = vp;
 1630         return 0;
 1631 }
 1632 
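/*
 * A minimal sketch of the common vcache_get() pattern, assuming a
 * file system that keys its vnode cache on the inode number (as the
 * UFS-style file systems do).  example_vget() is hypothetical; real
 * VFS_VGET() implementations additionally decide whether to return
 * the vnode locked.
 */
static int
example_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
{
        int error;

        /* Look up, or load and insert, the vnode / fs node pair. */
        error = vcache_get(mp, &ino, sizeof(ino), vpp);
        if (error != 0)
                return error;

        /* On success *vpp is referenced, unlocked and VS_LOADED. */
        return 0;
}
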
 1633 /*
 1634  * Create a new vnode / fs node pair and return it referenced through vpp.
 1635  */
 1636 int
 1637 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
 1638     kauth_cred_t cred, void *extra, struct vnode **vpp)
 1639 {
 1640         int error;
 1641         uint32_t hash;
 1642         struct vnode *vp, *ovp;
 1643         vnode_impl_t *vip, *ovip;
 1644 
 1645         *vpp = NULL;
 1646 
 1647         /* Allocate and initialize a new vcache / vnode pair. */
 1648         error = vfs_busy(mp);
 1649         if (error)
 1650                 return error;
 1651         vip = vcache_alloc();
 1652         vip->vi_key.vk_mount = mp;
 1653         vp = VIMPL_TO_VNODE(vip);
 1654 
 1655         /* Create and load the fs node. */
 1656         error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra,
 1657             &vip->vi_key.vk_key_len, &vip->vi_key.vk_key);
 1658         if (error) {
 1659                 mutex_enter(&vcache_lock);
 1660                 vcache_dealloc(vip);
 1661                 vfs_unbusy(mp);
 1662                 KASSERT(*vpp == NULL);
 1663                 return error;
 1664         }
 1665         KASSERT(vp->v_op != NULL);
 1666         KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount));
 1667         if (vip->vi_key.vk_key_len > 0) {
 1668                 KASSERT(vip->vi_key.vk_key != NULL);
 1669                 hash = vcache_hash(&vip->vi_key);
 1670 
 1671                 /*
 1672                  * Wait for previous instance to be reclaimed,
 1673                  * then insert new node.
 1674                  */
 1675                 mutex_enter(&vcache_lock);
 1676                 while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) {
 1677                         ovp = VIMPL_TO_VNODE(ovip);
 1678                         mutex_enter(ovp->v_interlock);
 1679                         mutex_exit(&vcache_lock);
 1680                         error = vcache_vget(ovp);
 1681                         KASSERT(error == ENOENT);
 1682                         mutex_enter(&vcache_lock);
 1683                 }
 1684                 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
 1685                     vip, vi_hash);
 1686                 mutex_exit(&vcache_lock);
 1687         }
 1688         vfs_insmntque(vp, mp);
 1689         if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
 1690                 vp->v_vflag |= VV_MPSAFE;
 1691         vfs_ref(mp);
 1692         vfs_unbusy(mp);
 1693 
 1694         /* Finished loading, finalize node. */
 1695         mutex_enter(&vcache_lock);
 1696         mutex_enter(vp->v_interlock);
 1697         VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
 1698         mutex_exit(&vcache_lock);
 1699         mutex_exit(vp->v_interlock);
 1700         *vpp = vp;
 1701         return 0;
 1702 }
 1703 
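/*
 * A minimal sketch of a create path using vcache_new(), assuming the
 * caller already holds a reference to the parent directory dvp and
 * passes any fs-specific setup data through "extra" (NULL here).
 * example_create_node() is hypothetical and not part of this file.
 */
static int
example_create_node(struct vnode *dvp, struct vattr *vap,
    kauth_cred_t cred, struct vnode **vpp)
{
        int error;

        /* VFS_NEWVNODE() does the fs work; the pair comes back loaded. */
        error = vcache_new(dvp->v_mount, dvp, vap, cred, NULL, vpp);
        if (error != 0)
                return error;

        /* *vpp is referenced and unlocked; lock it if the caller needs to. */
        return 0;
}
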
 1704 /*
 1705  * Prepare key change: update the old cache node's key and lock the new cache node.
 1706  * Return an error if the new node already exists.
 1707  */
 1708 int
 1709 vcache_rekey_enter(struct mount *mp, struct vnode *vp,
 1710     const void *old_key, size_t old_key_len,
 1711     const void *new_key, size_t new_key_len)
 1712 {
 1713         uint32_t old_hash, new_hash;
 1714         struct vcache_key old_vcache_key, new_vcache_key;
 1715         vnode_impl_t *vip, *new_vip;
 1716 
 1717         old_vcache_key.vk_mount = mp;
 1718         old_vcache_key.vk_key = old_key;
 1719         old_vcache_key.vk_key_len = old_key_len;
 1720         old_hash = vcache_hash(&old_vcache_key);
 1721 
 1722         new_vcache_key.vk_mount = mp;
 1723         new_vcache_key.vk_key = new_key;
 1724         new_vcache_key.vk_key_len = new_key_len;
 1725         new_hash = vcache_hash(&new_vcache_key);
 1726 
 1727         new_vip = vcache_alloc();
 1728         new_vip->vi_key = new_vcache_key;
 1729 
 1730         /* Insert locked new node used as placeholder. */
 1731         mutex_enter(&vcache_lock);
 1732         vip = vcache_hash_lookup(&new_vcache_key, new_hash);
 1733         if (vip != NULL) {
 1734                 vcache_dealloc(new_vip);
 1735                 return EEXIST;
 1736         }
 1737         SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
 1738             new_vip, vi_hash);
 1739 
 1740         /* Replace the old node's key with the temporary copy. */
 1741         vip = vcache_hash_lookup(&old_vcache_key, old_hash);
 1742         KASSERT(vip != NULL);
 1743         KASSERT(VIMPL_TO_VNODE(vip) == vp);
 1744         KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key);
 1745         vip->vi_key = old_vcache_key;
 1746         mutex_exit(&vcache_lock);
 1747         return 0;
 1748 }
 1749 
 1750 /*
 1751  * Key change complete: update old node and remove placeholder.
 1752  */
 1753 void
 1754 vcache_rekey_exit(struct mount *mp, struct vnode *vp,
 1755     const void *old_key, size_t old_key_len,
 1756     const void *new_key, size_t new_key_len)
 1757 {
 1758         uint32_t old_hash, new_hash;
 1759         struct vcache_key old_vcache_key, new_vcache_key;
 1760         vnode_impl_t *vip, *new_vip;
 1761         struct vnode *new_vp;
 1762 
 1763         old_vcache_key.vk_mount = mp;
 1764         old_vcache_key.vk_key = old_key;
 1765         old_vcache_key.vk_key_len = old_key_len;
 1766         old_hash = vcache_hash(&old_vcache_key);
 1767 
 1768         new_vcache_key.vk_mount = mp;
 1769         new_vcache_key.vk_key = new_key;
 1770         new_vcache_key.vk_key_len = new_key_len;
 1771         new_hash = vcache_hash(&new_vcache_key);
 1772 
 1773         mutex_enter(&vcache_lock);
 1774 
 1775         /* Lookup old and new node. */
 1776         vip = vcache_hash_lookup(&old_vcache_key, old_hash);
 1777         KASSERT(vip != NULL);
 1778         KASSERT(VIMPL_TO_VNODE(vip) == vp);
 1779 
 1780         new_vip = vcache_hash_lookup(&new_vcache_key, new_hash);
 1781         KASSERT(new_vip != NULL);
 1782         KASSERT(new_vip->vi_key.vk_key_len == new_key_len);
 1783         new_vp = VIMPL_TO_VNODE(new_vip);
 1784         mutex_enter(new_vp->v_interlock);
 1785         VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING);
 1786         mutex_exit(new_vp->v_interlock);
 1787 
 1788         /* Rekey old node and put it onto its new hashlist. */
 1789         vip->vi_key = new_vcache_key;
 1790         if (old_hash != new_hash) {
 1791                 SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask],
 1792                     vip, vnode_impl, vi_hash);
 1793                 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
 1794                     vip, vi_hash);
 1795         }
 1796 
 1797         /* Remove new node used as placeholder. */
 1798         SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask],
 1799             new_vip, vnode_impl, vi_hash);
 1800         vcache_dealloc(new_vip);
 1801 }
 1802 
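/*
 * A minimal sketch of the rekey protocol as a pair, assuming a file
 * system whose cache key encodes the node's location (for example a
 * directory offset) and therefore changes on rename.  example_rekey()
 * and its key arguments are hypothetical.
 */
static int
example_rekey(struct mount *mp, struct vnode *vp,
    const void *old_key, size_t old_len,
    const void *new_key, size_t new_len)
{
        int error;

        /* Insert a placeholder under the new key; vp keeps its old key. */
        error = vcache_rekey_enter(mp, vp, old_key, old_len,
            new_key, new_len);
        if (error != 0)
                return error;           /* EEXIST: new key already cached */

        /* ... update the fs node so that it matches new_key ... */

        /* Move vp to the new key and drop the placeholder. */
        vcache_rekey_exit(mp, vp, old_key, old_len, new_key, new_len);
        return 0;
}
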
 1803 /*
 1804  * Disassociate the underlying file system from a vnode.
 1805  *
 1806  * Must be called with vnode locked and will return unlocked.
 1807  * Must be called with the interlock held, and will return with it held.
 1808  */
 1809 static void
 1810 vcache_reclaim(vnode_t *vp)
 1811 {
 1812         lwp_t *l = curlwp;
 1813         vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
 1814         struct mount *mp = vp->v_mount;
 1815         uint32_t hash;
 1816         uint8_t temp_buf[64], *temp_key;
 1817         size_t temp_key_len;
 1818         bool recycle;
 1819         int error;
 1820 
 1821         KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
 1822         KASSERT(mutex_owned(vp->v_interlock));
 1823         KASSERT(vrefcnt(vp) != 0);
 1824 
 1825         temp_key_len = vip->vi_key.vk_key_len;
 1826         /*
 1827          * Prevent the vnode from being recycled or brought into use
 1828          * while we clean it out.
 1829          */
 1830         VSTATE_CHANGE(vp, VS_BLOCKED, VS_RECLAIMING);
 1831 
 1832         /*
 1833          * Send NOTE_REVOKE now, before we call VOP_RECLAIM(),
 1834          * because VOP_RECLAIM() could cause vp->v_klist to
 1835          * become invalid.  Don't check for interest in NOTE_REVOKE
 1836          * here; it's always posted because it sets EV_EOF.
 1837          *
 1838          * Once it's been posted, reset vp->v_klist to point to
 1839          * our own local storage, in case we were sharing with
 1840          * someone else.
 1841          */
 1842         KNOTE(&vp->v_klist->vk_klist, NOTE_REVOKE);
 1843         vp->v_klist = &vip->vi_klist;
 1844         mutex_exit(vp->v_interlock);
 1845 
 1846         rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
 1847         mutex_enter(vp->v_interlock);
 1848         if ((vp->v_iflag & VI_EXECMAP) != 0) {
 1849                 cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
 1850         }
 1851         vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
 1852         vp->v_iflag |= VI_DEADCHECK; /* for genfs_getpages() */
 1853         mutex_exit(vp->v_interlock);
 1854         rw_exit(vp->v_uobj.vmobjlock);
 1855 
 1856         /*
 1857          * With vnode state set to reclaiming, purge name cache immediately
 1858          * to prevent new handles on vnode, and wait for existing threads
 1859          * trying to get a handle to notice VS_RECLAIMED status and abort.
 1860          */
 1861         cache_purge(vp);
 1862 
 1863         /* Replace the vnode key with a temporary copy. */
 1864         if (vip->vi_key.vk_key_len > sizeof(temp_buf)) {
 1865                 temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
 1866         } else {
 1867                 temp_key = temp_buf;
 1868         }
 1869         if (vip->vi_key.vk_key_len > 0) {
 1870                 mutex_enter(&vcache_lock);
 1871                 memcpy(temp_key, vip->vi_key.vk_key, temp_key_len);
 1872                 vip->vi_key.vk_key = temp_key;
 1873                 mutex_exit(&vcache_lock);
 1874         }
 1875 
 1876         fstrans_start(mp);
 1877 
 1878         /*
 1879          * Clean out any cached data associated with the vnode.
 1880          */
 1881         error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
 1882         if (error != 0) {
 1883                 if (wapbl_vphaswapbl(vp))
 1884                         WAPBL_DISCARD(wapbl_vptomp(vp));
 1885                 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
 1886         }
 1887         KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
 1888         KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
 1889         if (vp->v_type == VBLK || vp->v_type == VCHR) {
 1890                 spec_node_revoke(vp);
 1891         }
 1892 
 1893         /*
 1894          * Disassociate the underlying file system from the vnode.
 1895          * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
 1896          * the vnode, and may destroy the vnode so that VOP_UNLOCK
 1897          * would no longer function.
 1898          */
 1899         VOP_INACTIVE(vp, &recycle);
 1900         KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
 1901         if (VOP_RECLAIM(vp)) {
 1902                 vnpanic(vp, "%s: cannot reclaim", __func__);
 1903         }
 1904 
 1905         KASSERT(vp->v_data == NULL);
 1906         KASSERT((vp->v_iflag & VI_PAGES) == 0);
 1907 
 1908         if (vp->v_type == VREG && vp->v_ractx != NULL) {
 1909                 uvm_ra_freectx(vp->v_ractx);
 1910                 vp->v_ractx = NULL;
 1911         }
 1912 
 1913         /* Remove from vnode cache. */
 1914         if (vip->vi_key.vk_key_len > 0) {
 1915                 hash = vcache_hash(&vip->vi_key);
 1916                 mutex_enter(&vcache_lock);
 1917                 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
 1918                 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
 1919                     vip, vnode_impl, vi_hash);
 1920                 mutex_exit(&vcache_lock);
 1921         }
 1922         if (temp_key != temp_buf)
 1923                 kmem_free(temp_key, temp_key_len);
 1924 
 1925         /* Done with purge, notify sleepers of the grim news. */
 1926         mutex_enter(vp->v_interlock);
 1927         vp->v_op = dead_vnodeop_p;
 1928         VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
 1929         vp->v_tag = VT_NON;
 1930         mutex_exit(vp->v_interlock);
 1931 
 1932         /*
 1933          * Move to dead mount.  Must be after changing the operations
 1934          * vector as vnode operations enter the mount before using the
 1935          * operations vector.  See sys/kern/vnode_if.c.
 1936          */
 1937         vp->v_vflag &= ~VV_ROOT;
 1938         vfs_ref(dead_rootmount);
 1939         vfs_insmntque(vp, dead_rootmount);
 1940 
 1941 #ifdef PAX_SEGVGUARD
 1942         pax_segvguard_cleanup(vp);
 1943 #endif /* PAX_SEGVGUARD */
 1944 
 1945         mutex_enter(vp->v_interlock);
 1946         fstrans_done(mp);
 1947         KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
 1948 }
 1949 
 1950 /*
 1951  * Disassociate the underlying file system from an open device vnode
 1952  * and make it anonymous.
 1953  *
 1954  * Vnode unlocked on entry, drops a reference to the vnode.
 1955  */
 1956 void
 1957 vcache_make_anon(vnode_t *vp)
 1958 {
 1959         vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
 1960         uint32_t hash;
 1961         bool recycle;
 1962 
 1963         KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
 1964         KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
 1965         VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
 1966 
 1967         /* Remove from vnode cache. */
 1968         hash = vcache_hash(&vip->vi_key);
 1969         mutex_enter(&vcache_lock);
 1970         KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
 1971         SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
 1972             vip, vnode_impl, vi_hash);
 1973         vip->vi_key.vk_mount = dead_rootmount;
 1974         vip->vi_key.vk_key_len = 0;
 1975         vip->vi_key.vk_key = NULL;
 1976         mutex_exit(&vcache_lock);
 1977 
 1978         /*
 1979          * Disassociate the underlying file system from the vnode.
 1980          * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
 1981          * the vnode, and may destroy the vnode so that VOP_UNLOCK
 1982          * would no longer function.
 1983          */
 1984         if (vn_lock(vp, LK_EXCLUSIVE)) {
 1985                 vnpanic(vp, "%s: cannot lock", __func__);
 1986         }
 1987         VOP_INACTIVE(vp, &recycle);
 1988         KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
 1989         if (VOP_RECLAIM(vp)) {
 1990                 vnpanic(vp, "%s: cannot reclaim", __func__);
 1991         }
 1992 
 1993         /* Purge name cache. */
 1994         cache_purge(vp);
 1995 
 1996         /* Done with purge, change operations vector. */
 1997         mutex_enter(vp->v_interlock);
 1998         vp->v_op = spec_vnodeop_p;
 1999         vp->v_vflag |= VV_MPSAFE;
 2000         mutex_exit(vp->v_interlock);
 2001 
 2002         /*
 2003          * Move to dead mount.  Must be after changing the operations
 2004          * vector as vnode operations enter the mount before using the
 2005          * operations vector.  See sys/kern/vnode_if.c.
 2006          */
 2007         vfs_ref(dead_rootmount);
 2008         vfs_insmntque(vp, dead_rootmount);
 2009 
 2010         vrele(vp);
 2011 }
 2012 
 2013 /*
 2014  * Update outstanding I/O count and do wakeup if requested.
 2015  */
 2016 void
 2017 vwakeup(struct buf *bp)
 2018 {
 2019         vnode_t *vp;
 2020 
 2021         if ((vp = bp->b_vp) == NULL)
 2022                 return;
 2023 
 2024         KASSERT(bp->b_objlock == vp->v_interlock);
 2025         KASSERT(mutex_owned(bp->b_objlock));
 2026 
 2027         if (--vp->v_numoutput < 0)
 2028                 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
 2029         if (vp->v_numoutput == 0)
 2030                 cv_broadcast(&vp->v_cv);
 2031 }
 2032 
 2033 /*
 2034  * Test a vnode for being or becoming dead.  Returns one of:
 2035  * EBUSY:  vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
 2036  * ENOENT: vnode is dead.
 2037  * 0:      otherwise.
 2038  *
 2039  * Whenever this function returns a non-zero value all future
 2040  * calls will also return a non-zero value.
 2041  */
 2042 int
 2043 vdead_check(struct vnode *vp, int flags)
 2044 {
 2045 
 2046         KASSERT(mutex_owned(vp->v_interlock));
 2047 
 2048         if (! ISSET(flags, VDEAD_NOWAIT))
 2049                 VSTATE_WAIT_STABLE(vp);
 2050 
 2051         if (VSTATE_GET(vp) == VS_RECLAIMING) {
 2052                 KASSERT(ISSET(flags, VDEAD_NOWAIT));
 2053                 return EBUSY;
 2054         } else if (VSTATE_GET(vp) == VS_RECLAIMED) {
 2055                 return ENOENT;
 2056         }
 2057 
 2058         return 0;
 2059 }
 2060 
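/*
 * A minimal sketch of a vdead_check() caller, assuming code that must
 * not proceed on a revoked or dying vnode.  example_use_if_alive() is
 * hypothetical and not part of this file.
 */
static int
example_use_if_alive(struct vnode *vp)
{
        int error;

        mutex_enter(vp->v_interlock);
        /* Without VDEAD_NOWAIT this would first wait for a stable state. */
        error = vdead_check(vp, VDEAD_NOWAIT);
        mutex_exit(vp->v_interlock);
        if (error != 0)
                return error;           /* EBUSY: dying, ENOENT: dead */

        /* ... the vnode was alive at the time of the check ... */
        return 0;
}
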
 2061 int
 2062 vfs_drainvnodes(void)
 2063 {
 2064         int i, gen;
 2065 
 2066         mutex_enter(&vdrain_lock);
 2067         for (i = 0; i < 2; i++) {
 2068                 gen = vdrain_gen;
 2069                 while (gen == vdrain_gen) {
 2070                         cv_broadcast(&vdrain_cv);
 2071                         cv_wait(&vdrain_gen_cv, &vdrain_lock);
 2072                 }
 2073         }
 2074         mutex_exit(&vdrain_lock);
 2075 
 2076         if (numvnodes >= desiredvnodes)
 2077                 return EBUSY;
 2078 
 2079         if (vcache_hashsize != desiredvnodes)
 2080                 vcache_reinit();
 2081 
 2082         return 0;
 2083 }
 2084 
 2085 void
 2086 vnpanic(vnode_t *vp, const char *fmt, ...)
 2087 {
 2088         va_list ap;
 2089 
 2090 #ifdef DIAGNOSTIC
 2091         vprint(NULL, vp);
 2092 #endif
 2093         va_start(ap, fmt);
 2094         vpanic(fmt, ap);
 2095         va_end(ap);
 2096 }
 2097 
 2098 void
 2099 vshareilock(vnode_t *tvp, vnode_t *fvp)
 2100 {
 2101         kmutex_t *oldlock;
 2102 
 2103         oldlock = tvp->v_interlock;
 2104         mutex_obj_hold(fvp->v_interlock);
 2105         tvp->v_interlock = fvp->v_interlock;
 2106         mutex_obj_free(oldlock);
 2107 }
 2108 
 2109 void
 2110 vshareklist(vnode_t *tvp, vnode_t *fvp)
 2111 {
 2112         /*
 2113          * If two vnodes share klist state, they must also share
 2114          * an interlock.
 2115          */
 2116         KASSERT(tvp->v_interlock == fvp->v_interlock);
 2117 
 2118         /*
 2119          * We make the following assumptions:
 2120          *
 2121          * ==> Some other synchronization is happening outside of
 2122          *     our view to make this safe.
 2123          *
 2124          * ==> That the "to" vnode will have the necessary references
 2125          *     on the "from" vnode so that the storage for the klist
 2126          *     won't be yanked out from beneath us (the vnode_impl).
 2127          *
 2128          * ==> If "from" is also sharing, we then assume that "from"
 2129          *     has the necessary references, and so on.
 2130          */
 2131         tvp->v_klist = fvp->v_klist;
 2132 }
