FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_cache.c

/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *      The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#ifdef INVARIANTS
#include <machine/_inttypes.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache");

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

struct negstate {
        u_char neg_flag;
        u_char neg_hit;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct  namecache {
        LIST_ENTRY(namecache) nc_src;   /* source vnode list */
        TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
        CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
        struct  vnode *nc_dvp;          /* vnode of parent of name */
        union {
                struct  vnode *nu_vp;   /* vnode the name refers to */
                struct  negstate nu_neg;/* negative entry state */
        } n_un;
        u_char  nc_flag;                /* flag bits */
        u_char  nc_nlen;                /* length of name */
        char    nc_name[0];             /* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct  namecache_ts {
        struct  timespec nc_time;       /* timespec provided by fs */
        struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
        int     nc_ticks;               /* ticks value when entry was added */
        int     nc_pad;
        struct namecache nc_nc;
};

TAILQ_HEAD(cache_freebatch, namecache);

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since other platforms
 * may have the same requirement, take the small hit and enforce the
 * alignment for everyone.  Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
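
/*
 * (Editor's note) UMA alignment arguments are masks (alignment - 1), which
 * is why the size asserts below check divisibility by
 * CACHE_ZONE_ALIGNMENT + 1.
 */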

/*
 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
 * 4.4 BSD codebase. Later on struct namecache was tweaked to become
 * smaller and the value was bumped to retain the total size, but it
 * was never re-evaluated for suitability. A simple test counting
 * lengths during package building shows that the value of 45 covers
 * about 86% of all added entries, reaching 99% at 65.
 *
 * Regardless of the above, use of dedicated zones instead of malloc may be
 * inducing additional waste. This may be hard to address as said zones are
 * tied to VFS SMR. Even if retaining them, the current split should be
 * re-evaluated.
 */
#ifdef __LP64__
#define CACHE_PATH_CUTOFF       45
#define CACHE_LARGE_PAD         6
#else
#define CACHE_PATH_CUTOFF       41
#define CACHE_LARGE_PAD         2
#endif

#define CACHE_ZONE_SMALL_SIZE           (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
#define CACHE_ZONE_LARGE_SIZE           (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
#define CACHE_ZONE_LARGE_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define nc_vp           n_un.nu_vp
#define nc_neg          n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE       0x01
#define NCF_ISDOTDOT    0x02
#define NCF_TS          0x04
#define NCF_DTS         0x08
#define NCF_DVDROP      0x10
#define NCF_NEGATIVE    0x20
#define NCF_INVALID     0x40
#define NCF_WIP         0x80

/*
 * Flags in negstate.neg_flag
 */
#define NEG_HOT         0x01

static bool     cache_neg_evict_cond(u_long lnumcache);

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

        KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
            ("%s: entry %p already invalid", __func__, ncp));
        atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
        atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
#define cache_ncp_canuse(ncp)   ({                                      \
        struct namecache *_ncp = (ncp);                                 \
        u_char _nc_flag;                                                \
                                                                        \
        atomic_thread_fence_acq();                                      \
        _nc_flag = atomic_load_char(&_ncp->nc_flag);                    \
        __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);      \
})

/*
 * Like the above but also checks NCF_WHITE.
 */
#define cache_fpl_neg_ncp_canuse(ncp)   ({                              \
        struct namecache *_ncp = (ncp);                                 \
        u_char _nc_flag;                                                \
                                                                        \
        atomic_thread_fence_acq();                                      \
        _nc_flag = atomic_load_char(&_ncp->nc_flag);                    \
        __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0);  \
})

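/*
 * Illustrative use (editor's sketch, not in the original file): a lockless
 * consumer is expected to copy what it needs from the entry first and only
 * validate afterwards:
 *
 *      vp = ncp->nc_vp;
 *      if (!cache_ncp_canuse(ncp))
 *              goto out_fallback;
 *
 * The acquire fence in the macro pairs with the release fence in
 * cache_ncp_invalidate().
 */
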
/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  The cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME         TYPE    ROLE
 * vnodelock    mtx     vnode lists and v_cache_dd field protection
 * bucketlock   mtx     for access to given set of hash buckets
 * neglist      mtx     negative entry LRU management
 *
 * It is legal to take multiple vnodelock and bucketlock locks. The locking
 * order is lower address first. Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be locked to provide safe operation against other threads modifying the
 * cache.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the same state at the same time.
 * Similarly, two different threads can purge two different vnodes and try
 * to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock. However, in the opposite case, this could
 * deadlock. As such, this is resolved by trylocking and, if that fails,
 * unlocking the first node, locking everything in order and revalidating
 * the state.
 */
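
/*
 * Illustrative sketch (editor's addition, not in the original file): the
 * documented order when two vnode locks are needed:
 *
 *      cache_sort_vnodes(&vlp1, &vlp2);        (lower address first)
 *      cache_lock_vnodes(vlp1, vlp2);
 *      ...
 *      cache_unlock_vnodes(vlp1, vlp2);
 *
 * See cache_zap_locked_vnode_kl2() below for the trylock-and-restart dance
 * used when one of the locks is already held.
 */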

VFS_SMR_DECLARE;

static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache parameters");

static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
    "Total namecache capacity");

u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");

static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");

/*
 * Negative entry % of namecache capacity above which automatic eviction is allowed.
 *
 * Check cache_neg_evict_cond for details.
 */
static u_int ncnegminpct = 3;

static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
    "Negative entry count above which automatic eviction is allowed");

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
        (&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly     nchash;                 /* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */

struct nchstats nchstats;               /* cache effectiveness statistics */

static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

static bool __read_mostly cache_rename_add = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_rename_add, CTLFLAG_RW,
    &cache_rename_add, 0, "");

static u_int __exclusive_cache_line neg_cycle;

#define ncneghash       3
#define numneglists     (ncneghash + 1)

struct neglist {
        struct mtx              nl_evict_lock;
        struct mtx              nl_lock __aligned(CACHE_LINE_SIZE);
        TAILQ_HEAD(, namecache) nl_list;
        TAILQ_HEAD(, namecache) nl_hotlist;
        u_long                  nl_hotnum;
} __aligned(CACHE_LINE_SIZE);

static struct neglist neglists[numneglists];

static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

        return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

        MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
        return (&ncp->nc_neg);
}

#define numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct mtx_padalign __read_mostly  *bucketlocks;
#define HASH2BUCKETLOCK(hash) \
        ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

#define numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

        return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
        struct namecache_ts *ncp_ts;

        KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
            (tsp == NULL && ticksp == NULL),
            ("No NCF_TS"));

        if (tsp == NULL)
                return;

        ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
        *tsp = ncp_ts->nc_time;
        *ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");

#define STATNODE_ULONG(name, varname, descr)                                    \
        SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define STATNODE_COUNTER(name, varname, descr)                                  \
        static COUNTER_U64_DEFINE_EARLY(varname);                               \
        SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
            descr);
STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
STATNODE_ULONG(count, numcache, "Number of cache entries");
STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, dotdothits, "Number of '..' hits");
STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(poszaps, numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(negzaps, numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");

/*
 * Debug or developer statistics.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache debugging");
#define DEBUGNODE_ULONG(name, varname, descr)                                   \
        SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define DEBUGNODE_COUNTER(name, varname, descr)                                 \
        static COUNTER_U64_DEFINE_EARLY(varname);                               \
        SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
            descr);
DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
    "Number of successful removals after relocking");
static long zap_bucket_fail;
DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
static long zap_bucket_fail2;
DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
static long cache_lock_vnodes_cel_3_failures;
DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");

static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen);
static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, size_t addend);
static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *len, size_t addend);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

static inline void
cache_assert_vlp_locked(struct mtx *vlp)
{

        if (vlp != NULL)
                mtx_assert(vlp, MA_OWNED);
}

static inline void
cache_assert_vnode_locked(struct vnode *vp)
{
        struct mtx *vlp;

        vlp = VP2VNODELOCK(vp);
        cache_assert_vlp_locked(vlp);
}

/*
 * Directory vnodes with entries are held for two reasons:
 * 1. make them less of a target for reclamation in vnlru
 * 2. suffer a smaller performance penalty in locked lookup as requeuing is
 *    avoided
 *
 * It will be feasible to stop doing it altogether if all filesystems start
 * supporting lockless lookup.
 */
static void
cache_hold_vnode(struct vnode *vp)
{

        cache_assert_vnode_locked(vp);
        VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
        vhold(vp);
        counter_u64_add(numcachehv, 1);
}

static void
cache_drop_vnode(struct vnode *vp)
{

        /*
         * Called after all locks are dropped, meaning we can't assert
         * on the state of v_cache_src.
         */
        vdrop(vp);
        counter_u64_add(numcachehv, -1);
}

/*
 * UMA zones.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

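/*
 * Allocate storage for caching a symlink target.  Returns NULL when the
 * target does not fit even the large zone; the caller is then expected to
 * proceed without caching it.
 */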
char *
cache_symlink_alloc(size_t size, int flags)
{

        if (size < CACHE_ZONE_SMALL_SIZE) {
                return (uma_zalloc_smr(cache_zone_small, flags));
        }
        if (size < CACHE_ZONE_LARGE_SIZE) {
                return (uma_zalloc_smr(cache_zone_large, flags));
        }
        counter_u64_add(symlinktoobig, 1);
        SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
        return (NULL);
}

void
cache_symlink_free(char *string, size_t size)
{

        MPASS(string != NULL);
        KASSERT(size < CACHE_ZONE_LARGE_SIZE,
            ("%s: size %zu too big", __func__, size));

        if (size < CACHE_ZONE_SMALL_SIZE) {
                uma_zfree_smr(cache_zone_small, string);
                return;
        }
        if (size < CACHE_ZONE_LARGE_SIZE) {
                uma_zfree_smr(cache_zone_large, string);
                return;
        }
        __assert_unreachable();
}

static struct namecache *
cache_alloc_uma(int len, bool ts)
{
        struct namecache_ts *ncp_ts;
        struct namecache *ncp;

        if (__predict_false(ts)) {
                if (len <= CACHE_PATH_CUTOFF)
                        ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
                else
                        ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
                ncp = &ncp_ts->nc_nc;
        } else {
                if (len <= CACHE_PATH_CUTOFF)
                        ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
                else
                        ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
        }
        return (ncp);
}

static void
cache_free_uma(struct namecache *ncp)
{
        struct namecache_ts *ncp_ts;

        if (__predict_false(ncp->nc_flag & NCF_TS)) {
                ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
                if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
                        uma_zfree_smr(cache_zone_small_ts, ncp_ts);
                else
                        uma_zfree_smr(cache_zone_large_ts, ncp_ts);
        } else {
                if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
                        uma_zfree_smr(cache_zone_small, ncp);
                else
                        uma_zfree_smr(cache_zone_large, ncp);
        }
}

static struct namecache *
cache_alloc(int len, bool ts)
{
        u_long lnumcache;

        /*
         * Avoid blowout in namecache entries.
         *
         * Bugs:
         * 1. filesystems may end up trying to add an already existing entry
         * (for example this can happen after a cache miss during concurrent
         * lookup), in which case we will call cache_neg_evict despite not
         * adding anything.
         * 2. the routine may fail to free anything and no provisions are made
         * to make it try harder (see the inside for failure modes)
         * 3. it only ever looks at negative entries.
         */
        lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
        if (cache_neg_evict_cond(lnumcache)) {
                lnumcache = atomic_load_long(&numcache);
        }
        if (__predict_false(lnumcache >= ncsize)) {
                atomic_subtract_long(&numcache, 1);
                counter_u64_add(numdrops, 1);
                return (NULL);
        }
        return (cache_alloc_uma(len, ts));
}

static void
cache_free(struct namecache *ncp)
{

        MPASS(ncp != NULL);
        if ((ncp->nc_flag & NCF_DVDROP) != 0) {
                cache_drop_vnode(ncp->nc_dvp);
        }
        cache_free_uma(ncp);
        atomic_subtract_long(&numcache, 1);
}

static void
cache_free_batch(struct cache_freebatch *batch)
{
        struct namecache *ncp, *nnp;
        int i;

        i = 0;
        if (TAILQ_EMPTY(batch))
                goto out;
        TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
                if ((ncp->nc_flag & NCF_DVDROP) != 0) {
                        cache_drop_vnode(ncp->nc_dvp);
                }
                cache_free_uma(ncp);
                i++;
        }
        atomic_subtract_long(&numcache, i);
out:
        SDT_PROBE1(vfs, namecache, purge, batch, i);
}

/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address. The choice of FNV should also be revisited.
 */
static void
cache_prehash(struct vnode *vp)
{

        vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

        return (fnv_32_buf(name, len, dvp->v_nchash));
}

static uint32_t
cache_get_hash_iter_start(struct vnode *dvp)
{

        return (dvp->v_nchash);
}

static uint32_t
cache_get_hash_iter(char c, uint32_t hash)
{

        return (fnv_32_buf(&c, 1, hash));
}

static uint32_t
cache_get_hash_iter_finish(uint32_t hash)
{

        return (hash);
}

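/*
 * (Editor's note) The iterator variants must be equivalent to hashing the
 * whole name in one go, which holds because FNV consumes one byte at a
 * time:
 *
 *      hash = cache_get_hash_iter_start(dvp);
 *      for (i = 0; i < len; i++)
 *              hash = cache_get_hash_iter(name[i], hash);
 *      hash = cache_get_hash_iter_finish(hash);
 *
 * yields the same value as cache_get_hash(name, len, dvp).
 */
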
static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
        uint32_t hash;

        hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
        return (NCHHASH(hash));
}

static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
{
        uint32_t hash;

        hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
        return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp)
{
        struct mtx *blp;

        blp = NCP2BUCKETLOCK(ncp);
        mtx_assert(blp, MA_OWNED);
}

static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{
        struct mtx *blp;

        blp = NCP2BUCKETLOCK(ncp);
        mtx_assert(blp, MA_NOTOWNED);
}
#else
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif

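/*
 * Order a pair of pointers so that the lower address comes first, matching
 * the lock order described at the top of the file.  A NULL pointer (at most
 * one) sorts first.
 */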
#define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
        void *tmp;

        MPASS(*p1 != NULL || *p2 != NULL);

        if (*p1 > *p2) {
                tmp = *p2;
                *p2 = *p1;
                *p1 = tmp;
        }
}

static void
cache_lock_all_buckets(void)
{
        u_int i;

        for (i = 0; i < numbucketlocks; i++)
                mtx_lock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
        u_int i;

        for (i = 0; i < numbucketlocks; i++)
                mtx_unlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
        u_int i;

        for (i = 0; i < numvnodelocks; i++)
                mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
        u_int i;

        for (i = 0; i < numvnodelocks; i++)
                mtx_unlock(&vnodelocks[i]);
}

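/*
 * Try to take both vnode locks without sleeping; one of them may be NULL.
 * Returns EAGAIN with neither lock held if either trylock fails.
 */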
static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

        cache_sort_vnodes(&vlp1, &vlp2);

        if (vlp1 != NULL) {
                if (!mtx_trylock(vlp1))
                        return (EAGAIN);
        }
        if (!mtx_trylock(vlp2)) {
                if (vlp1 != NULL)
                        mtx_unlock(vlp1);
                return (EAGAIN);
        }

        return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

        MPASS(vlp1 != NULL || vlp2 != NULL);
        MPASS(vlp1 <= vlp2);

        if (vlp1 != NULL)
                mtx_lock(vlp1);
        if (vlp2 != NULL)
                mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

        MPASS(vlp1 != NULL || vlp2 != NULL);

        if (vlp1 != NULL)
                mtx_unlock(vlp1);
        if (vlp2 != NULL)
                mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
        struct nchstats snap;

        if (req->oldptr == NULL)
                return (SYSCTL_OUT(req, 0, sizeof(snap)));

        snap = nchstats;
        snap.ncs_goodhits = counter_u64_fetch(numposhits);
        snap.ncs_neghits = counter_u64_fetch(numneghits);
        snap.ncs_badhits = counter_u64_fetch(numposzaps) +
            counter_u64_fetch(numnegzaps);
        snap.ncs_miss = counter_u64_fetch(nummisszap) +
            counter_u64_fetch(nummiss);

        return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

static void
cache_recalc_neg_min(u_int val)
{

        neg_min = (ncsize * val) / 100;
}

static int
sysctl_negminpct(SYSCTL_HANDLER_ARGS)
{
        u_int val;
        int error;

        val = ncnegminpct;
        error = sysctl_handle_int(oidp, &val, 0, req);
        if (error != 0 || req->newptr == NULL)
                return (error);

        if (val == ncnegminpct)
                return (0);
        if (val > 99)
                return (EINVAL);
        ncnegminpct = val;
        cache_recalc_neg_min(val);
        return (0);
}

SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
    "I", "Negative entry % of namecache capacity above which automatic eviction is allowed");

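/*
 * Example (editor's note): "sysctl vfs.cache.param.negminpct=5" allows
 * automatic eviction only once negative entries exceed 5% of the total
 * namecache capacity.
 */
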
#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
        struct nchashhead *ncpp;
        struct namecache *ncp;
        int i, error, n_nchash, *cntbuf;

retry:
        n_nchash = nchash + 1;  /* nchash is max index, not count */
        if (req->oldptr == NULL)
                return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
        cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
        cache_lock_all_buckets();
        if (n_nchash != nchash + 1) {
                cache_unlock_all_buckets();
                free(cntbuf, M_TEMP);
                goto retry;
        }
        /* Scan hash tables counting entries */
        for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
                CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
                        cntbuf[i]++;
        cache_unlock_all_buckets();
        for (error = 0, i = 0; i < n_nchash; i++)
                if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
                        break;
        free(cntbuf, M_TEMP);
        return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
        int error;
        struct nchashhead *ncpp;
        struct namecache *ncp;
        int n_nchash;
        int count, maxlength, used, pct;

        if (!req->oldptr)
                return SYSCTL_OUT(req, 0, 4 * sizeof(int));

        cache_lock_all_buckets();
        n_nchash = nchash + 1;  /* nchash is max index, not count */
        used = 0;
        maxlength = 0;

        /* Scan hash tables for applicable entries */
        for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
                count = 0;
                CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
                        count++;
                }
                if (count)
                        used++;
                if (maxlength < count)
                        maxlength = count;
        }
        n_nchash = nchash + 1;
        cache_unlock_all_buckets();
        /* Note: reported in hundredths of a percent, not whole percent. */
        pct = (used * 100) / (n_nchash / 100);
        error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
        if (error)
                return (error);
        error = SYSCTL_OUT(req, &used, sizeof(used));
        if (error)
                return (error);
        error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
        if (error)
                return (error);
        error = SYSCTL_OUT(req, &pct, sizeof(pct));
        if (error)
                return (error);
        return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * Various workloads create plenty of negative entries and barely use them
 * afterwards. Moreover malicious users can keep performing bogus lookups
 * adding even more entries. For example "make tinderbox" as of writing this
 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 * negative.
 *
 * As such, a rather aggressive eviction method is needed. The currently
 * employed method is a placeholder.
 *
 * Entries are split over numneglists separate lists, each of which is further
 * split into hot and cold entries. Entries get promoted after getting a hit.
 * Eviction happens on addition of a new entry.
 */
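
/*
 * (Editor's note) The split can be observed at runtime via the sysctls
 * defined below, e.g. vfs.cache.neg.count and vfs.cache.neg.hot.
 */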
static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache negative entry statistics");

SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
    "Number of negative cache entries");

static COUNTER_U64_DEFINE_EARLY(neg_created);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
    "Number of created negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evicted);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
    "Number of evicted negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
    &neg_evict_skipped_empty,
    "Number of times evicting failed due to lack of entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
    &neg_evict_skipped_missed,
    "Number of times evicting failed due to target entry disappearing");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
    &neg_evict_skipped_contended,
    "Number of times evicting failed due to contention");

SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
    "Number of cache hits (negative)");

static int
sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
{
        int i, out;

        out = 0;
        for (i = 0; i < numneglists; i++)
                out += neglists[i].nl_hotnum;

        return (SYSCTL_OUT(req, &out, sizeof(out)));
}
SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
    "Number of hot negative entries");

static void
cache_neg_init(struct namecache *ncp)
{
        struct negstate *ns;

        ncp->nc_flag |= NCF_NEGATIVE;
        ns = NCP2NEGSTATE(ncp);
        ns->neg_flag = 0;
        ns->neg_hit = 0;
        counter_u64_add(neg_created, 1);
}

#define CACHE_NEG_PROMOTION_THRESH 2

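/*
 * Count a hit on a negative entry.  Returns true when this hit crosses the
 * promotion threshold, in which case the caller is expected to promote the
 * entry (or bail via cache_neg_hit_abort()).
 */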
static bool
cache_neg_hit_prep(struct namecache *ncp)
{
        struct negstate *ns;
        u_char n;

        ns = NCP2NEGSTATE(ncp);
        n = atomic_load_char(&ns->neg_hit);
        for (;;) {
                if (n >= CACHE_NEG_PROMOTION_THRESH)
                        return (false);
                if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
                        break;
        }
        return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
}

/*
 * Nothing to do here but it is provided for completeness as some
 * cache_neg_hit_prep callers may end up returning without even
 * trying to promote.
 */
#define cache_neg_hit_abort(ncp)        do { } while (0)

static void
cache_neg_hit_finish(struct namecache *ncp)
{

        SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
        counter_u64_add(numneghits, 1);
}

/*
 * Move a negative entry to the hot list.
 */
static void
cache_neg_promote_locked(struct namecache *ncp)
{
        struct neglist *nl;
        struct negstate *ns;

        ns = NCP2NEGSTATE(ncp);
        nl = NCP2NEGLIST(ncp);
        mtx_assert(&nl->nl_lock, MA_OWNED);
        if ((ns->neg_flag & NEG_HOT) == 0) {
                TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
                TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
                nl->nl_hotnum++;
                ns->neg_flag |= NEG_HOT;
        }
}

/*
 * Move a hot negative entry to the cold list.
 */
static void
cache_neg_demote_locked(struct namecache *ncp)
{
        struct neglist *nl;
        struct negstate *ns;

        ns = NCP2NEGSTATE(ncp);
        nl = NCP2NEGLIST(ncp);
        mtx_assert(&nl->nl_lock, MA_OWNED);
        MPASS(ns->neg_flag & NEG_HOT);
        TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
        TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
        nl->nl_hotnum--;
        ns->neg_flag &= ~NEG_HOT;
        atomic_store_char(&ns->neg_hit, 0);
}

/*
 * Move a negative entry to the hot list if it matches the lookup.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block with it. Exiting the section means
 * the found entry could have been evicted. We are going to look it
 * up again.
 */
static bool
cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
    struct namecache *oncp, uint32_t hash)
{
        struct namecache *ncp;
        struct neglist *nl;
        u_char nc_flag;

        nl = NCP2NEGLIST(oncp);

        mtx_lock(&nl->nl_lock);
        /*
         * For hash iteration.
         */
        vfs_smr_enter();

        /*
         * Avoid all surprises by only succeeding if we got the same entry and
         * bailing completely otherwise.
         * XXX There are no provisions to keep the vnode around, meaning we may
         * end up promoting a negative entry for a *new* vnode and returning
         * ENOENT on its account. This is the error we want to return anyway
         * and promotion is harmless.
         *
         * In particular at this point there can be a new ncp which matches the
         * search but hashes to a different neglist.
         */
        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp == oncp)
                        break;
        }

        /*
         * No match to begin with.
         */
        if (__predict_false(ncp == NULL)) {
                goto out_abort;
        }

        /*
         * The newly found entry may be something different...
         */
        if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
            !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
                goto out_abort;
        }

        /*
         * ... and not even negative.
         */
        nc_flag = atomic_load_char(&ncp->nc_flag);
        if ((nc_flag & NCF_NEGATIVE) == 0) {
                goto out_abort;
        }

        if (!cache_ncp_canuse(ncp)) {
                goto out_abort;
        }

        cache_neg_promote_locked(ncp);
        cache_neg_hit_finish(ncp);
        vfs_smr_exit();
        mtx_unlock(&nl->nl_lock);
        return (true);
out_abort:
        vfs_smr_exit();
        mtx_unlock(&nl->nl_lock);
        return (false);
}

static void
cache_neg_promote(struct namecache *ncp)
{
        struct neglist *nl;

        nl = NCP2NEGLIST(ncp);
        mtx_lock(&nl->nl_lock);
        cache_neg_promote_locked(ncp);
        mtx_unlock(&nl->nl_lock);
}

static void
cache_neg_insert(struct namecache *ncp)
{
        struct neglist *nl;

        MPASS(ncp->nc_flag & NCF_NEGATIVE);
        cache_assert_bucket_locked(ncp);
        nl = NCP2NEGLIST(ncp);
        mtx_lock(&nl->nl_lock);
        TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
        mtx_unlock(&nl->nl_lock);
        atomic_add_long(&numneg, 1);
}

static void
cache_neg_remove(struct namecache *ncp)
{
        struct neglist *nl;
        struct negstate *ns;

        cache_assert_bucket_locked(ncp);
        nl = NCP2NEGLIST(ncp);
        ns = NCP2NEGSTATE(ncp);
        mtx_lock(&nl->nl_lock);
        if ((ns->neg_flag & NEG_HOT) != 0) {
                TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
                nl->nl_hotnum--;
        } else {
                TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
        }
        mtx_unlock(&nl->nl_lock);
        atomic_subtract_long(&numneg, 1);
}

static struct neglist *
cache_neg_evict_select_list(void)
{
        struct neglist *nl;
        u_int c;

        c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
        nl = &neglists[c % numneglists];
        if (!mtx_trylock(&nl->nl_evict_lock)) {
                counter_u64_add(neg_evict_skipped_contended, 1);
                return (NULL);
        }
        return (nl);
}

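/*
 * Select an eviction candidate: inspect up to 4 entries at the head of the
 * cold list and pick the one with the fewest recorded hits.
 */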
 1314 static struct namecache *
 1315 cache_neg_evict_select_entry(struct neglist *nl)
 1316 {
 1317         struct namecache *ncp, *lncp;
 1318         struct negstate *ns, *lns;
 1319         int i;
 1320 
 1321         mtx_assert(&nl->nl_evict_lock, MA_OWNED);
 1322         mtx_assert(&nl->nl_lock, MA_OWNED);
 1323         ncp = TAILQ_FIRST(&nl->nl_list);
 1324         if (ncp == NULL)
 1325                 return (NULL);
 1326         lncp = ncp;
 1327         lns = NCP2NEGSTATE(lncp);
 1328         for (i = 1; i < 4; i++) {
 1329                 ncp = TAILQ_NEXT(ncp, nc_dst);
 1330                 if (ncp == NULL)
 1331                         break;
 1332                 ns = NCP2NEGSTATE(ncp);
 1333                 if (ns->neg_hit < lns->neg_hit) {
 1334                         lncp = ncp;
 1335                         lns = ns;
 1336                 }
 1337         }
 1338         return (lncp);
 1339 }
 1340 
 1341 static bool
 1342 cache_neg_evict(void)
 1343 {
 1344         struct namecache *ncp, *ncp2;
 1345         struct neglist *nl;
 1346         struct vnode *dvp;
 1347         struct mtx *dvlp;
 1348         struct mtx *blp;
 1349         uint32_t hash;
 1350         u_char nlen;
 1351         bool evicted;
 1352 
 1353         nl = cache_neg_evict_select_list();
 1354         if (nl == NULL) {
 1355                 return (false);
 1356         }
 1357 
 1358         mtx_lock(&nl->nl_lock);
 1359         ncp = TAILQ_FIRST(&nl->nl_hotlist);
 1360         if (ncp != NULL) {
 1361                 cache_neg_demote_locked(ncp);
 1362         }
 1363         ncp = cache_neg_evict_select_entry(nl);
 1364         if (ncp == NULL) {
 1365                 counter_u64_add(neg_evict_skipped_empty, 1);
 1366                 mtx_unlock(&nl->nl_lock);
 1367                 mtx_unlock(&nl->nl_evict_lock);
 1368                 return (false);
 1369         }
 1370         nlen = ncp->nc_nlen;
 1371         dvp = ncp->nc_dvp;
 1372         hash = cache_get_hash(ncp->nc_name, nlen, dvp);
 1373         dvlp = VP2VNODELOCK(dvp);
 1374         blp = HASH2BUCKETLOCK(hash);
 1375         mtx_unlock(&nl->nl_lock);
 1376         mtx_unlock(&nl->nl_evict_lock);
 1377         mtx_lock(dvlp);
 1378         mtx_lock(blp);
 1379         /*
 1380          * Note that since all locks were dropped above, the entry may be
 1381          * gone or reallocated to be something else.
 1382          */
 1383         CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
 1384                 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
 1385                     ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
 1386                         break;
 1387         }
 1388         if (ncp2 == NULL) {
 1389                 counter_u64_add(neg_evict_skipped_missed, 1);
 1390                 ncp = NULL;
 1391                 evicted = false;
 1392         } else {
 1393                 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
 1394                 MPASS(blp == NCP2BUCKETLOCK(ncp));
 1395                 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
 1396                     ncp->nc_name);
 1397                 cache_zap_locked(ncp);
 1398                 counter_u64_add(neg_evicted, 1);
 1399                 evicted = true;
 1400         }
 1401         mtx_unlock(blp);
 1402         mtx_unlock(dvlp);
 1403         if (ncp != NULL)
 1404                 cache_free(ncp);
 1405         return (evicted);
 1406 }
 1407 
 1408 /*
 1409  * Maybe evict a negative entry to create more room.
 1410  *
 1411  * The ncnegfactor parameter limits what fraction of the total count
 1412  * can comprise of negative entries. However, if the cache is just
 1413  * warming up this leads to excessive evictions.  As such, ncnegminpct
 1414  * (recomputed to neg_min) dictates whether the above should be
 1415  * applied.
 1416  *
 1417  * Try evicting if the cache is close to full capacity regardless of
 1418  * other considerations.
 1419  */
 1420 static bool
 1421 cache_neg_evict_cond(u_long lnumcache)
 1422 {
 1423         u_long lnumneg;
 1424 
 1425         if (ncsize - 1000 < lnumcache)
 1426                 goto out_evict;
 1427         lnumneg = atomic_load_long(&numneg);
 1428         if (lnumneg < neg_min)
 1429                 return (false);
 1430         if (lnumneg * ncnegfactor < lnumcache)
 1431                 return (false);
 1432 out_evict:
 1433         return (cache_neg_evict());
 1434 }
 1435 
 1436 /*
 1437  * cache_zap_locked():
 1438  *
 1439  *   Removes a namecache entry from cache, whether it contains an actual
 1440  *   pointer to a vnode or if it is just a negative cache entry.
 1441  */
 1442 static void
 1443 cache_zap_locked(struct namecache *ncp)
 1444 {
 1445         struct nchashhead *ncpp;
 1446         struct vnode *dvp, *vp;
 1447 
 1448         dvp = ncp->nc_dvp;
 1449         vp = ncp->nc_vp;
 1450 
 1451         if (!(ncp->nc_flag & NCF_NEGATIVE))
 1452                 cache_assert_vnode_locked(vp);
 1453         cache_assert_vnode_locked(dvp);
 1454         cache_assert_bucket_locked(ncp);
 1455 
 1456         cache_ncp_invalidate(ncp);
 1457 
 1458         ncpp = NCP2BUCKET(ncp);
 1459         CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
 1460         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 1461                 SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
 1462                 TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
 1463                 if (ncp == vp->v_cache_dd) {
 1464                         atomic_store_ptr(&vp->v_cache_dd, NULL);
 1465                 }
 1466         } else {
 1467                 SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
 1468                 cache_neg_remove(ncp);
 1469         }
 1470         if (ncp->nc_flag & NCF_ISDOTDOT) {
 1471                 if (ncp == dvp->v_cache_dd) {
 1472                         atomic_store_ptr(&dvp->v_cache_dd, NULL);
 1473                 }
 1474         } else {
 1475                 LIST_REMOVE(ncp, nc_src);
 1476                 if (LIST_EMPTY(&dvp->v_cache_src)) {
 1477                         ncp->nc_flag |= NCF_DVDROP;
 1478                 }
 1479         }
 1480 }
 1481 
 1482 static void
 1483 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
 1484 {
 1485         struct mtx *blp;
 1486 
 1487         MPASS(ncp->nc_dvp == vp);
 1488         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 1489         cache_assert_vnode_locked(vp);
 1490 
 1491         blp = NCP2BUCKETLOCK(ncp);
 1492         mtx_lock(blp);
 1493         cache_zap_locked(ncp);
 1494         mtx_unlock(blp);
 1495 }
 1496 
 1497 static bool
 1498 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
 1499     struct mtx **vlpp)
 1500 {
 1501         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
 1502         struct mtx *blp;
 1503 
 1504         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
 1505         cache_assert_vnode_locked(vp);
 1506 
 1507         if (ncp->nc_flag & NCF_NEGATIVE) {
 1508                 if (*vlpp != NULL) {
 1509                         mtx_unlock(*vlpp);
 1510                         *vlpp = NULL;
 1511                 }
 1512                 cache_zap_negative_locked_vnode_kl(ncp, vp);
 1513                 return (true);
 1514         }
 1515 
 1516         pvlp = VP2VNODELOCK(vp);
 1517         blp = NCP2BUCKETLOCK(ncp);
 1518         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
 1519         vlp2 = VP2VNODELOCK(ncp->nc_vp);
 1520 
 1521         if (*vlpp == vlp1 || *vlpp == vlp2) {
 1522                 to_unlock = *vlpp;
 1523                 *vlpp = NULL;
 1524         } else {
 1525                 if (*vlpp != NULL) {
 1526                         mtx_unlock(*vlpp);
 1527                         *vlpp = NULL;
 1528                 }
 1529                 cache_sort_vnodes(&vlp1, &vlp2);
 1530                 if (vlp1 == pvlp) {
 1531                         mtx_lock(vlp2);
 1532                         to_unlock = vlp2;
 1533                 } else {
 1534                         if (!mtx_trylock(vlp1))
 1535                                 goto out_relock;
 1536                         to_unlock = vlp1;
 1537                 }
 1538         }
 1539         mtx_lock(blp);
 1540         cache_zap_locked(ncp);
 1541         mtx_unlock(blp);
 1542         if (to_unlock != NULL)
 1543                 mtx_unlock(to_unlock);
 1544         return (true);
 1545 
 1546 out_relock:
 1547         mtx_unlock(vlp2);
 1548         mtx_lock(vlp1);
 1549         mtx_lock(vlp2);
 1550         MPASS(*vlpp == NULL);
 1551         *vlpp = vlp1;
 1552         return (false);
 1553 }
 1554 
 1555 /*
 1556  * If trylocking failed, we can get here.  We know enough to take all needed
 1557  * locks in the right order (sorted by address) and re-lookup the entry.
 1558  */
 1559 static int
 1560 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
 1561     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
 1562     struct mtx *blp)
 1563 {
 1564         struct namecache *rncp;
 1565 
 1566         cache_assert_bucket_unlocked(ncp);
 1567 
 1568         cache_sort_vnodes(&dvlp, &vlp);
 1569         cache_lock_vnodes(dvlp, vlp);
 1570         mtx_lock(blp);
 1571         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
 1572                 if (rncp == ncp && rncp->nc_dvp == dvp &&
 1573                     rncp->nc_nlen == cnp->cn_namelen &&
 1574                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
 1575                         break;
 1576         }
 1577         if (rncp != NULL) {
 1578                 cache_zap_locked(rncp);
 1579                 mtx_unlock(blp);
 1580                 cache_unlock_vnodes(dvlp, vlp);
 1581                 counter_u64_add(zap_bucket_relock_success, 1);
 1582                 return (0);
 1583         }
 1584 
 1585         mtx_unlock(blp);
 1586         cache_unlock_vnodes(dvlp, vlp);
 1587         return (EAGAIN);
 1588 }
 1589 
 1590 static int __noinline
 1591 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
 1592     uint32_t hash, struct mtx *blp)
 1593 {
 1594         struct mtx *dvlp, *vlp;
 1595         struct vnode *dvp;
 1596 
 1597         cache_assert_bucket_locked(ncp);
 1598 
 1599         dvlp = VP2VNODELOCK(ncp->nc_dvp);
 1600         vlp = NULL;
 1601         if (!(ncp->nc_flag & NCF_NEGATIVE))
 1602                 vlp = VP2VNODELOCK(ncp->nc_vp);
 1603         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
 1604                 cache_zap_locked(ncp);
 1605                 mtx_unlock(blp);
 1606                 cache_unlock_vnodes(dvlp, vlp);
 1607                 return (0);
 1608         }
 1609 
 1610         dvp = ncp->nc_dvp;
 1611         mtx_unlock(blp);
 1612         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
 1613 }
 1614 
 1615 static __noinline int
 1616 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
 1617 {
 1618         struct namecache *ncp;
 1619         struct mtx *blp;
 1620         struct mtx *dvlp, *dvlp2;
 1621         uint32_t hash;
 1622         int error;
 1623 
 1624         if (cnp->cn_namelen == 2 &&
 1625             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
 1626                 dvlp = VP2VNODELOCK(dvp);
 1627                 dvlp2 = NULL;
 1628                 mtx_lock(dvlp);
 1629 retry_dotdot:
 1630                 ncp = dvp->v_cache_dd;
 1631                 if (ncp == NULL) {
 1632                         mtx_unlock(dvlp);
 1633                         if (dvlp2 != NULL)
 1634                                 mtx_unlock(dvlp2);
 1635                         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
 1636                         return (0);
 1637                 }
 1638                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
 1639                         if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
 1640                                 goto retry_dotdot;
 1641                         MPASS(dvp->v_cache_dd == NULL);
 1642                         mtx_unlock(dvlp);
 1643                         if (dvlp2 != NULL)
 1644                                 mtx_unlock(dvlp2);
 1645                         cache_free(ncp);
 1646                 } else {
 1647                         atomic_store_ptr(&dvp->v_cache_dd, NULL);
 1648                         mtx_unlock(dvlp);
 1649                         if (dvlp2 != NULL)
 1650                                 mtx_unlock(dvlp2);
 1651                 }
 1652                 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
 1653                 return (1);
 1654         }
 1655 
 1656         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 1657         blp = HASH2BUCKETLOCK(hash);
 1658 retry:
 1659         if (CK_SLIST_EMPTY(NCHHASH(hash)))
 1660                 goto out_no_entry;
 1661 
 1662         mtx_lock(blp);
 1663 
 1664         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 1665                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 1666                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 1667                         break;
 1668         }
 1669 
 1670         if (ncp == NULL) {
 1671                 mtx_unlock(blp);
 1672                 goto out_no_entry;
 1673         }
 1674 
 1675         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
 1676         if (__predict_false(error != 0)) {
 1677                 zap_bucket_fail++;
 1678                 goto retry;
 1679         }
 1680         counter_u64_add(numposzaps, 1);
 1681         SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
 1682         cache_free(ncp);
 1683         return (1);
 1684 out_no_entry:
 1685         counter_u64_add(nummisszap, 1);
 1686         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
 1687         return (0);
 1688 }
 1689 
 1690 static int __noinline
 1691 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
 1692     struct timespec *tsp, int *ticksp)
 1693 {
 1694         int ltype;
 1695 
 1696         *vpp = dvp;
 1697         counter_u64_add(dothits, 1);
 1698         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
 1699         if (tsp != NULL)
 1700                 timespecclear(tsp);
 1701         if (ticksp != NULL)
 1702                 *ticksp = ticks;
 1703         vrefact(*vpp);
 1704         /*
 1705          * When we look up ".", we can still be asked to lock it
 1706          * differently...
 1707          */
 1708         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
 1709         if (ltype != VOP_ISLOCKED(*vpp)) {
 1710                 if (ltype == LK_EXCLUSIVE) {
 1711                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
 1712                         if (VN_IS_DOOMED((*vpp))) {
 1713                                 /* forced unmount */
 1714                                 vrele(*vpp);
 1715                                 *vpp = NULL;
 1716                                 return (ENOENT);
 1717                         }
 1718                 } else
 1719                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
 1720         }
 1721         return (-1);
 1722 }
 1723 
 1724 static int __noinline
 1725 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
 1726     struct timespec *tsp, int *ticksp)
 1727 {
 1728         struct namecache_ts *ncp_ts;
 1729         struct namecache *ncp;
 1730         struct mtx *dvlp;
 1731         enum vgetstate vs;
 1732         int error, ltype;
 1733         bool whiteout;
 1734 
 1735         MPASS((cnp->cn_flags & ISDOTDOT) != 0);
 1736 
 1737         if ((cnp->cn_flags & MAKEENTRY) == 0) {
 1738                 cache_remove_cnp(dvp, cnp);
 1739                 return (0);
 1740         }
 1741 
 1742         counter_u64_add(dotdothits, 1);
 1743 retry:
 1744         dvlp = VP2VNODELOCK(dvp);
 1745         mtx_lock(dvlp);
 1746         ncp = dvp->v_cache_dd;
 1747         if (ncp == NULL) {
 1748                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
 1749                 mtx_unlock(dvlp);
 1750                 return (0);
 1751         }
 1752         if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
 1753                 if (ncp->nc_flag & NCF_NEGATIVE)
 1754                         *vpp = NULL;
 1755                 else
 1756                         *vpp = ncp->nc_vp;
 1757         } else
 1758                 *vpp = ncp->nc_dvp;
 1759         if (*vpp == NULL)
 1760                 goto negative_success;
 1761         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
 1762         cache_out_ts(ncp, tsp, ticksp);
 1763         if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
 1764             NCF_DTS && tsp != NULL) {
 1765                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 1766                 *tsp = ncp_ts->nc_dotdottime;
 1767         }
 1768 
 1769         MPASS(dvp != *vpp);
 1770         ltype = VOP_ISLOCKED(dvp);
 1771         VOP_UNLOCK(dvp);
 1772         vs = vget_prep(*vpp);
 1773         mtx_unlock(dvlp);
 1774         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
 1775         vn_lock(dvp, ltype | LK_RETRY);
 1776         if (VN_IS_DOOMED(dvp)) {
 1777                 if (error == 0)
 1778                         vput(*vpp);
 1779                 *vpp = NULL;
 1780                 return (ENOENT);
 1781         }
 1782         if (error) {
 1783                 *vpp = NULL;
 1784                 goto retry;
 1785         }
 1786         return (-1);
 1787 negative_success:
 1788         if (__predict_false(cnp->cn_nameiop == CREATE)) {
 1789                 if (cnp->cn_flags & ISLASTCN) {
 1790                         counter_u64_add(numnegzaps, 1);
 1791                         cache_zap_negative_locked_vnode_kl(ncp, dvp);
 1792                         mtx_unlock(dvlp);
 1793                         cache_free(ncp);
 1794                         return (0);
 1795                 }
 1796         }
 1797 
 1798         whiteout = (ncp->nc_flag & NCF_WHITE);
 1799         cache_out_ts(ncp, tsp, ticksp);
 1800         if (cache_neg_hit_prep(ncp))
 1801                 cache_neg_promote(ncp);
 1802         else
 1803                 cache_neg_hit_finish(ncp);
 1804         mtx_unlock(dvlp);
 1805         if (whiteout)
 1806                 cnp->cn_flags |= ISWHITEOUT;
 1807         return (ENOENT);
 1808 }
 1809 
 1810 /**
 1811  * Lookup a name in the name cache
 1812  *
 1813  * # Arguments
 1814  *
 1815  * - dvp:       Parent directory in which to search.
 1816  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
 1817  * - cnp:       Parameters of the name search.  The most interesting bits of
 1818  *              the cn_flags field have the following meanings:
 1819  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
 1820  *                      it up.
 1821  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
 1822  * - tsp:       Return storage for cache timestamp.  On a successful (positive
 1823  *              or negative) lookup, tsp will be filled with any timespec that
 1824  *              was stored when this cache entry was created.  However, it will
 1825  *              be clear for "." entries.
 1826  * - ticksp:    Return storage for alternate cache timestamp.  On a successful
 1827  *              (positive or negative) lookup, it will contain the ticks value
 1828  *              that was current when the cache entry was created, unless cnp
 1829  *              was ".".
 1830  *
 1831  * Either both tsp and ticksp must be provided, or neither of them.
 1832  *
 1833  * # Returns
 1834  *
 1835  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
 1836  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
 1837  *              to a forced unmount.  vpp will not be modified.  If the entry
 1838  *              is a whiteout, then the ISWHITEOUT flag will be set in
 1839  *              cnp->cn_flags.
 1840  * - 0:         A cache miss.  vpp will not be modified.
 1841  *
 1842  * # Locking
 1843  *
 1844  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 1845  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 1846  * lock is not recursively acquired.
 1847  */
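      /*
       * Illustrative caller sketch (hypothetical, compiled out; see
       * vfs_cache_lookup() below for the real consumer):
       */
      #if 0
              error = cache_lookup(dvp, &vp, cnp, NULL, NULL);
              switch (error) {
              case -1:        /* positive hit; vp is referenced and locked */
                      return (0);
              case 0:         /* miss; ask the filesystem */
                      return (VOP_CACHEDLOOKUP(dvp, &vp, cnp));
              default:        /* ENOENT: negative hit or doomed dvp */
                      return (error);
              }
      #endif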
 1848 static int __noinline
 1849 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
 1850     struct timespec *tsp, int *ticksp)
 1851 {
 1852         struct namecache *ncp;
 1853         struct mtx *blp;
 1854         uint32_t hash;
 1855         enum vgetstate vs;
 1856         int error;
 1857         bool whiteout;
 1858 
 1859         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
 1860         MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
 1861 
 1862 retry:
 1863         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 1864         blp = HASH2BUCKETLOCK(hash);
 1865         mtx_lock(blp);
 1866 
 1867         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 1868                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 1869                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 1870                         break;
 1871         }
 1872 
 1873         if (__predict_false(ncp == NULL)) {
 1874                 mtx_unlock(blp);
 1875                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
 1876                     NULL);
 1877                 counter_u64_add(nummiss, 1);
 1878                 return (0);
 1879         }
 1880 
 1881         if (ncp->nc_flag & NCF_NEGATIVE)
 1882                 goto negative_success;
 1883 
 1884         counter_u64_add(numposhits, 1);
 1885         *vpp = ncp->nc_vp;
 1886         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
 1887         cache_out_ts(ncp, tsp, ticksp);
 1888         MPASS(dvp != *vpp);
 1889         vs = vget_prep(*vpp);
 1890         mtx_unlock(blp);
 1891         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
 1892         if (error) {
 1893                 *vpp = NULL;
 1894                 goto retry;
 1895         }
 1896         return (-1);
 1897 negative_success:
 1898         /*
 1899          * We don't get here with a regular lookup apart from corner cases.
 1900          */
 1901         if (__predict_true(cnp->cn_nameiop == CREATE)) {
 1902                 if (cnp->cn_flags & ISLASTCN) {
 1903                         counter_u64_add(numnegzaps, 1);
 1904                         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
 1905                         if (__predict_false(error != 0)) {
 1906                                 zap_bucket_fail2++;
 1907                                 goto retry;
 1908                         }
 1909                         cache_free(ncp);
 1910                         return (0);
 1911                 }
 1912         }
 1913 
 1914         whiteout = (ncp->nc_flag & NCF_WHITE);
 1915         cache_out_ts(ncp, tsp, ticksp);
 1916         if (cache_neg_hit_prep(ncp))
 1917                 cache_neg_promote(ncp);
 1918         else
 1919                 cache_neg_hit_finish(ncp);
 1920         mtx_unlock(blp);
 1921         if (whiteout)
 1922                 cnp->cn_flags |= ISWHITEOUT;
 1923         return (ENOENT);
 1924 }
 1925 
 1926 int
 1927 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
 1928     struct timespec *tsp, int *ticksp)
 1929 {
 1930         struct namecache *ncp;
 1931         uint32_t hash;
 1932         enum vgetstate vs;
 1933         int error;
 1934         bool whiteout, neg_promote;
 1935         u_short nc_flag;
 1936 
 1937         MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
 1938 
 1939 #ifdef DEBUG_CACHE
 1940         if (__predict_false(!doingcache)) {
 1941                 cnp->cn_flags &= ~MAKEENTRY;
 1942                 return (0);
 1943         }
 1944 #endif
 1945 
 1946         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
 1947                 if (cnp->cn_namelen == 1)
 1948                         return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
 1949                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
 1950                         return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
 1951         }
 1952 
 1953         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
 1954 
 1955         if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
 1956                 cache_remove_cnp(dvp, cnp);
 1957                 return (0);
 1958         }
 1959 
 1960         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 1961         vfs_smr_enter();
 1962 
 1963         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 1964                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 1965                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 1966                         break;
 1967         }
 1968 
 1969         if (__predict_false(ncp == NULL)) {
 1970                 vfs_smr_exit();
 1971                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
 1972                     NULL);
 1973                 counter_u64_add(nummiss, 1);
 1974                 return (0);
 1975         }
 1976 
 1977         nc_flag = atomic_load_char(&ncp->nc_flag);
 1978         if (nc_flag & NCF_NEGATIVE)
 1979                 goto negative_success;
 1980 
 1981         counter_u64_add(numposhits, 1);
 1982         *vpp = ncp->nc_vp;
 1983         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
 1984         cache_out_ts(ncp, tsp, ticksp);
 1985         MPASS(dvp != *vpp);
 1986         if (!cache_ncp_canuse(ncp)) {
 1987                 vfs_smr_exit();
 1988                 *vpp = NULL;
 1989                 goto out_fallback;
 1990         }
 1991         vs = vget_prep_smr(*vpp);
 1992         vfs_smr_exit();
 1993         if (__predict_false(vs == VGET_NONE)) {
 1994                 *vpp = NULL;
 1995                 goto out_fallback;
 1996         }
 1997         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
 1998         if (error) {
 1999                 *vpp = NULL;
 2000                 goto out_fallback;
 2001         }
 2002         return (-1);
 2003 negative_success:
 2004         if (cnp->cn_nameiop == CREATE) {
 2005                 if (cnp->cn_flags & ISLASTCN) {
 2006                         vfs_smr_exit();
 2007                         goto out_fallback;
 2008                 }
 2009         }
 2010 
 2011         cache_out_ts(ncp, tsp, ticksp);
 2012         whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
 2013         neg_promote = cache_neg_hit_prep(ncp);
 2014         if (!cache_ncp_canuse(ncp)) {
 2015                 cache_neg_hit_abort(ncp);
 2016                 vfs_smr_exit();
 2017                 goto out_fallback;
 2018         }
 2019         if (neg_promote) {
 2020                 vfs_smr_exit();
 2021                 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
 2022                         goto out_fallback;
 2023         } else {
 2024                 cache_neg_hit_finish(ncp);
 2025                 vfs_smr_exit();
 2026         }
 2027         if (whiteout)
 2028                 cnp->cn_flags |= ISWHITEOUT;
 2029         return (ENOENT);
 2030 out_fallback:
 2031         return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
 2032 }
 2033 
 2034 struct celockstate {
 2035         struct mtx *vlp[3];
 2036         struct mtx *blp[2];
 2037 };
 2038 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
 2039 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
 2040 
 2041 static inline void
 2042 cache_celockstate_init(struct celockstate *cel)
 2043 {
 2044 
 2045         bzero(cel, sizeof(*cel));
 2046 }
 2047 
 2048 static void
 2049 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
 2050     struct vnode *dvp)
 2051 {
 2052         struct mtx *vlp1, *vlp2;
 2053 
 2054         MPASS(cel->vlp[0] == NULL);
 2055         MPASS(cel->vlp[1] == NULL);
 2056         MPASS(cel->vlp[2] == NULL);
 2057 
 2058         MPASS(vp != NULL || dvp != NULL);
 2059 
 2060         vlp1 = VP2VNODELOCK(vp);
 2061         vlp2 = VP2VNODELOCK(dvp);
 2062         cache_sort_vnodes(&vlp1, &vlp2);
 2063 
 2064         if (vlp1 != NULL) {
 2065                 mtx_lock(vlp1);
 2066                 cel->vlp[0] = vlp1;
 2067         }
 2068         mtx_lock(vlp2);
 2069         cel->vlp[1] = vlp2;
 2070 }
 2071 
 2072 static void
 2073 cache_unlock_vnodes_cel(struct celockstate *cel)
 2074 {
 2075 
 2076         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
 2077 
 2078         if (cel->vlp[0] != NULL)
 2079                 mtx_unlock(cel->vlp[0]);
 2080         if (cel->vlp[1] != NULL)
 2081                 mtx_unlock(cel->vlp[1]);
 2082         if (cel->vlp[2] != NULL)
 2083                 mtx_unlock(cel->vlp[2]);
 2084 }
 2085 
 2086 static bool
 2087 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
 2088 {
 2089         struct mtx *vlp;
 2090         bool ret;
 2091 
 2092         cache_assert_vlp_locked(cel->vlp[0]);
 2093         cache_assert_vlp_locked(cel->vlp[1]);
 2094         MPASS(cel->vlp[2] == NULL);
 2095 
 2096         MPASS(vp != NULL);
 2097         vlp = VP2VNODELOCK(vp);
 2098 
 2099         ret = true;
 2100         if (vlp >= cel->vlp[1]) {
 2101                 mtx_lock(vlp);
 2102         } else {
 2103                 if (mtx_trylock(vlp))
 2104                         goto out;
 2105                 cache_lock_vnodes_cel_3_failures++;
 2106                 cache_unlock_vnodes_cel(cel);
 2107                 if (vlp < cel->vlp[0]) {
 2108                         mtx_lock(vlp);
 2109                         mtx_lock(cel->vlp[0]);
 2110                         mtx_lock(cel->vlp[1]);
 2111                 } else {
 2112                         if (cel->vlp[0] != NULL)
 2113                                 mtx_lock(cel->vlp[0]);
 2114                         mtx_lock(vlp);
 2115                         mtx_lock(cel->vlp[1]);
 2116                 }
 2117                 ret = false;
 2118         }
 2119 out:
 2120         cel->vlp[2] = vlp;
 2121         return (ret);
 2122 }
 2123 
 2124 static void
 2125 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
 2126     struct mtx *blp2)
 2127 {
 2128 
 2129         MPASS(cel->blp[0] == NULL);
 2130         MPASS(cel->blp[1] == NULL);
 2131 
 2132         cache_sort_vnodes(&blp1, &blp2);
 2133 
 2134         if (blp1 != NULL) {
 2135                 mtx_lock(blp1);
 2136                 cel->blp[0] = blp1;
 2137         }
 2138         mtx_lock(blp2);
 2139         cel->blp[1] = blp2;
 2140 }
 2141 
 2142 static void
 2143 cache_unlock_buckets_cel(struct celockstate *cel)
 2144 {
 2145 
 2146         if (cel->blp[0] != NULL)
 2147                 mtx_unlock(cel->blp[0]);
 2148         mtx_unlock(cel->blp[1]);
 2149 }
 2150 
 2151 /*
 2152  * Lock part of the cache affected by the insertion.
 2153  *
 2154  * This means vnodelocks for dvp, vp and the relevant bucketlock.
 2155  * However, insertion can result in removal of an old entry. In this
 2156  * case we have an additional vnode and bucketlock pair to lock.
 2157  *
 2158  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 2159  * preserving the locking order (smaller address first).
 2160  */
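      /*
       * Worked example (hypothetical): inserting a name where vp is a
       * directory whose v_cache_dd holds a ".." entry pointing at vnode gvp
       * may require VP2VNODELOCK(dvp), VP2VNODELOCK(vp) and
       * VP2VNODELOCK(gvp), plus the bucket lock for the new entry and the
       * one for the old ".." entry.
       */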
 2161 static void
 2162 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
 2163     uint32_t hash)
 2164 {
 2165         struct namecache *ncp;
 2166         struct mtx *blps[2];
 2167         u_char nc_flag;
 2168 
 2169         blps[0] = HASH2BUCKETLOCK(hash);
 2170         for (;;) {
 2171                 blps[1] = NULL;
 2172                 cache_lock_vnodes_cel(cel, dvp, vp);
 2173                 if (vp == NULL || vp->v_type != VDIR)
 2174                         break;
 2175                 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
 2176                 if (ncp == NULL)
 2177                         break;
 2178                 nc_flag = atomic_load_char(&ncp->nc_flag);
 2179                 if ((nc_flag & NCF_ISDOTDOT) == 0)
 2180                         break;
 2181                 MPASS(ncp->nc_dvp == vp);
 2182                 blps[1] = NCP2BUCKETLOCK(ncp);
 2183                 if ((nc_flag & NCF_NEGATIVE) != 0)
 2184                         break;
 2185                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
 2186                         break;
 2187                 /*
 2188                  * All vnodes got re-locked.  Re-validate the state: if
 2189                  * nothing changed, we are done; otherwise restart.
 2190                  */
 2191                 if (ncp == vp->v_cache_dd &&
 2192                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
 2193                     blps[1] == NCP2BUCKETLOCK(ncp) &&
 2194                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
 2195                         break;
 2196                 cache_unlock_vnodes_cel(cel);
 2197                 cel->vlp[0] = NULL;
 2198                 cel->vlp[1] = NULL;
 2199                 cel->vlp[2] = NULL;
 2200         }
 2201         cache_lock_buckets_cel(cel, blps[0], blps[1]);
 2202 }
 2203 
 2204 static void
 2205 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
 2206     uint32_t hash)
 2207 {
 2208         struct namecache *ncp;
 2209         struct mtx *blps[2];
 2210         u_char nc_flag;
 2211 
 2212         blps[0] = HASH2BUCKETLOCK(hash);
 2213         for (;;) {
 2214                 blps[1] = NULL;
 2215                 cache_lock_vnodes_cel(cel, dvp, vp);
 2216                 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
 2217                 if (ncp == NULL)
 2218                         break;
 2219                 nc_flag = atomic_load_char(&ncp->nc_flag);
 2220                 if ((nc_flag & NCF_ISDOTDOT) == 0)
 2221                         break;
 2222                 MPASS(ncp->nc_dvp == dvp);
 2223                 blps[1] = NCP2BUCKETLOCK(ncp);
 2224                 if ((nc_flag & NCF_NEGATIVE) != 0)
 2225                         break;
 2226                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
 2227                         break;
 2228                 if (ncp == dvp->v_cache_dd &&
 2229                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
 2230                     blps[1] == NCP2BUCKETLOCK(ncp) &&
 2231                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
 2232                         break;
 2233                 cache_unlock_vnodes_cel(cel);
 2234                 cel->vlp[0] = NULL;
 2235                 cel->vlp[1] = NULL;
 2236                 cel->vlp[2] = NULL;
 2237         }
 2238         cache_lock_buckets_cel(cel, blps[0], blps[1]);
 2239 }
 2240 
 2241 static void
 2242 cache_enter_unlock(struct celockstate *cel)
 2243 {
 2244 
 2245         cache_unlock_buckets_cel(cel);
 2246         cache_unlock_vnodes_cel(cel);
 2247 }
 2248 
 2249 static void __noinline
 2250 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
 2251     struct componentname *cnp)
 2252 {
 2253         struct celockstate cel;
 2254         struct namecache *ncp;
 2255         uint32_t hash;
 2256         int len;
 2257 
 2258         if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
 2259                 return;
 2260         len = cnp->cn_namelen;
 2261         cache_celockstate_init(&cel);
 2262         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 2263         cache_enter_lock_dd(&cel, dvp, vp, hash);
 2264         ncp = dvp->v_cache_dd;
 2265         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
 2266                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
 2267                 cache_zap_locked(ncp);
 2268         } else {
 2269                 ncp = NULL;
 2270         }
 2271         atomic_store_ptr(&dvp->v_cache_dd, NULL);
 2272         cache_enter_unlock(&cel);
 2273         if (ncp != NULL)
 2274                 cache_free(ncp);
 2275 }
 2276 
 2277 /*
 2278  * Add an entry to the cache.
 2279  */
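      /*
       * Illustrative use (hypothetical, compiled out): filesystems typically
       * go through the cache_enter() wrapper after lookup, with vp == NULL
       * creating a negative entry:
       */
      #if 0
              if (cnp->cn_flags & MAKEENTRY)
                      cache_enter(dvp, *vpp, cnp);
      #endif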
 2280 void
 2281 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
 2282     struct timespec *tsp, struct timespec *dtsp)
 2283 {
 2284         struct celockstate cel;
 2285         struct namecache *ncp, *n2, *ndd;
 2286         struct namecache_ts *ncp_ts;
 2287         struct nchashhead *ncpp;
 2288         uint32_t hash;
 2289         int flag;
 2290         int len;
 2291 
 2292         KASSERT(cnp->cn_namelen <= NAME_MAX,
 2293             ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
 2294             NAME_MAX));
 2295         VNPASS(dvp != vp, dvp);
 2296         VNPASS(!VN_IS_DOOMED(dvp), dvp);
 2297         VNPASS(dvp->v_type != VNON, dvp);
 2298         if (vp != NULL) {
 2299                 VNPASS(!VN_IS_DOOMED(vp), vp);
 2300                 VNPASS(vp->v_type != VNON, vp);
 2301         }
 2302 
 2303 #ifdef DEBUG_CACHE
 2304         if (__predict_false(!doingcache))
 2305                 return;
 2306 #endif
 2307 
 2308         flag = 0;
 2309         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
 2310                 if (cnp->cn_namelen == 1)
 2311                         return;
 2312                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 2313                         cache_enter_dotdot_prep(dvp, vp, cnp);
 2314                         flag = NCF_ISDOTDOT;
 2315                 }
 2316         }
 2317 
 2318         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
 2319         if (ncp == NULL)
 2320                 return;
 2321 
 2322         cache_celockstate_init(&cel);
 2323         ndd = NULL;
 2324         ncp_ts = NULL;
 2325 
 2326         /*
 2327          * Calculate the hash key and setup as much of the new
 2328          * namecache entry as possible before acquiring the lock.
 2329          */
 2330         ncp->nc_flag = flag | NCF_WIP;
 2331         ncp->nc_vp = vp;
 2332         if (vp == NULL)
 2333                 cache_neg_init(ncp);
 2334         ncp->nc_dvp = dvp;
 2335         if (tsp != NULL) {
 2336                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 2337                 ncp_ts->nc_time = *tsp;
 2338                 ncp_ts->nc_ticks = ticks;
 2339                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
 2340                 if (dtsp != NULL) {
 2341                         ncp_ts->nc_dotdottime = *dtsp;
 2342                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
 2343                 }
 2344         }
 2345         len = ncp->nc_nlen = cnp->cn_namelen;
 2346         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 2347         memcpy(ncp->nc_name, cnp->cn_nameptr, len);
 2348         ncp->nc_name[len] = '\0';
 2349         cache_enter_lock(&cel, dvp, vp, hash);
 2350 
 2351         /*
 2352          * See if this vnode or negative entry is already in the cache
 2353          * with this name.  This can happen with concurrent lookups of
 2354          * the same path name.
 2355          */
 2356         ncpp = NCHHASH(hash);
 2357         CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
 2358                 if (n2->nc_dvp == dvp &&
 2359                     n2->nc_nlen == cnp->cn_namelen &&
 2360                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
 2361                         MPASS(cache_ncp_canuse(n2));
 2362                         if ((n2->nc_flag & NCF_NEGATIVE) != 0)
 2363                                 KASSERT(vp == NULL,
 2364                                     ("%s: found entry pointing to a different vnode (%p != %p)",
 2365                                     __func__, NULL, vp));
 2366                         else
 2367                                 KASSERT(n2->nc_vp == vp,
 2368                                     ("%s: found entry pointing to a different vnode (%p != %p)",
 2369                                     __func__, n2->nc_vp, vp));
 2370                         /*
 2371                          * Entries are supposed to be immutable unless in the
 2372          * process of getting destroyed.  Accommodating
 2373                          * changing timestamps is possible but not worth it.
 2374                          * This should be harmless in terms of correctness, in
 2375                          * the worst case resulting in an earlier expiration.
 2376                          * Alternatively, the found entry can be replaced
 2377                          * altogether.
 2378                          */
 2379                         MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
 2380 #if 0
 2381                         if (tsp != NULL) {
 2382                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
 2383                                     ("no NCF_TS"));
 2384                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
 2385                                 n2_ts->nc_time = ncp_ts->nc_time;
 2386                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
 2387                                 if (dtsp != NULL) {
 2388                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
 2389                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
 2390                                 }
 2391                         }
 2392 #endif
 2393                         SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
 2394                             vp);
 2395                         goto out_unlock_free;
 2396                 }
 2397         }
 2398 
 2399         if (flag == NCF_ISDOTDOT) {
 2400                 /*
 2401                  * See if we are trying to add a ".." entry, but some other
 2402                  * lookup has already populated the v_cache_dd pointer.
 2403                  */
 2404                 if (dvp->v_cache_dd != NULL)
 2405                         goto out_unlock_free;
 2406                 KASSERT(vp == NULL || vp->v_type == VDIR,
 2407                     ("wrong vnode type %p", vp));
 2408                 atomic_thread_fence_rel();
 2409                 atomic_store_ptr(&dvp->v_cache_dd, ncp);
 2410         }
 2411 
 2412         if (vp != NULL) {
 2413                 if (flag != NCF_ISDOTDOT) {
 2414                         /*
 2415                          * For this case, the cache entry maps both the
 2416                          * directory name in it and the name ".." for the
 2417                          * directory's parent.
 2418                          */
 2419                         if ((ndd = vp->v_cache_dd) != NULL) {
 2420                                 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
 2421                                         cache_zap_locked(ndd);
 2422                                 else
 2423                                         ndd = NULL;
 2424                         }
 2425                         atomic_thread_fence_rel();
 2426                         atomic_store_ptr(&vp->v_cache_dd, ncp);
 2427                 } else if (vp->v_type != VDIR) {
 2428                         if (vp->v_cache_dd != NULL) {
 2429                                 atomic_store_ptr(&vp->v_cache_dd, NULL);
 2430                         }
 2431                 }
 2432         }
 2433 
 2434         if (flag != NCF_ISDOTDOT) {
 2435                 if (LIST_EMPTY(&dvp->v_cache_src)) {
 2436                         cache_hold_vnode(dvp);
 2437                 }
 2438                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 2439         }
 2440 
 2441         /*
 2442          * If the entry is "negative", we place it into the
 2443          * "negative" cache queue, otherwise, we place it into the
 2444          * destination vnode's cache entries queue.
 2445          */
 2446         if (vp != NULL) {
 2447                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
 2448                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
 2449                     vp);
 2450         } else {
 2451                 if (cnp->cn_flags & ISWHITEOUT)
 2452                         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
 2453                 cache_neg_insert(ncp);
 2454                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
 2455                     ncp->nc_name);
 2456         }
 2457 
 2458         /*
 2459          * Insert the new namecache entry into the appropriate chain
 2460          * within the cache entries table.
 2461          */
 2462         CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 2463 
 2464         atomic_thread_fence_rel();
 2465         /*
 2466          * Mark the entry as fully constructed.
 2467          * It is immutable past this point until its removal.
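               * The release fence above makes the fully constructed entry
               * visible before NCF_WIP is cleared, pairing with lockless
               * readers (see the cache_ncp_canuse() checks in cache_lookup()).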
 2468          */
 2469         atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
 2470 
 2471         cache_enter_unlock(&cel);
 2472         if (ndd != NULL)
 2473                 cache_free(ndd);
 2474         return;
 2475 out_unlock_free:
 2476         cache_enter_unlock(&cel);
 2477         cache_free(ncp);
 2478         return;
 2479 }
 2480 
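      /*
       * Return the smallest power of 2 strictly greater than val, e.g.
       * cache_roundup_2(8) == 16 and cache_roundup_2(9) == 16.
       */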
 2481 static u_int
 2482 cache_roundup_2(u_int val)
 2483 {
 2484         u_int res;
 2485 
 2486         for (res = 1; res <= val; res <<= 1)
 2487                 continue;
 2488 
 2489         return (res);
 2490 }
 2491 
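      /*
       * Allocate a bucket array with a power-of-2 element count so that
       * (hash & *hashmask) selects a bucket; *hashmask is set accordingly.
       */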
 2492 static struct nchashhead *
 2493 nchinittbl(u_long elements, u_long *hashmask)
 2494 {
 2495         struct nchashhead *hashtbl;
 2496         u_long hashsize, i;
 2497 
 2498         hashsize = cache_roundup_2(elements) / 2;
 2499 
 2500         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
 2501         for (i = 0; i < hashsize; i++)
 2502                 CK_SLIST_INIT(&hashtbl[i]);
 2503         *hashmask = hashsize - 1;
 2504         return (hashtbl);
 2505 }
 2506 
 2507 static void
 2508 ncfreetbl(struct nchashhead *hashtbl)
 2509 {
 2510 
 2511         free(hashtbl, M_VFSCACHE);
 2512 }
 2513 
 2514 /*
 2515  * Name cache initialization, from vfs_init() when we are booting
 2516  */
 2517 static void
 2518 nchinit(void *dummy __unused)
 2519 {
 2520         u_int i;
 2521 
 2522         cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
 2523             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 2524         cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
 2525             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 2526         cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
 2527             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 2528         cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
 2529             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 2530 
 2531         VFS_SMR_ZONE_SET(cache_zone_small);
 2532         VFS_SMR_ZONE_SET(cache_zone_small_ts);
 2533         VFS_SMR_ZONE_SET(cache_zone_large);
 2534         VFS_SMR_ZONE_SET(cache_zone_large_ts);
 2535 
 2536         ncsize = desiredvnodes * ncsizefactor;
 2537         cache_recalc_neg_min(ncnegminpct);
 2538         nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
 2539         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
 2540         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
 2541                 ncbuckethash = 7;
 2542         if (ncbuckethash > nchash)
 2543                 ncbuckethash = nchash;
 2544         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
 2545             M_WAITOK | M_ZERO);
 2546         for (i = 0; i < numbucketlocks; i++)
 2547                 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
 2548         ncvnodehash = ncbuckethash;
 2549         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
 2550             M_WAITOK | M_ZERO);
 2551         for (i = 0; i < numvnodelocks; i++)
 2552                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
 2553 
 2554         for (i = 0; i < numneglists; i++) {
 2555                 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
 2556                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
 2557                 TAILQ_INIT(&neglists[i].nl_list);
 2558                 TAILQ_INIT(&neglists[i].nl_hotlist);
 2559         }
 2560 }
 2561 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
 2562 
 2563 void
 2564 cache_vnode_init(struct vnode *vp)
 2565 {
 2566 
 2567         LIST_INIT(&vp->v_cache_src);
 2568         TAILQ_INIT(&vp->v_cache_dst);
 2569         vp->v_cache_dd = NULL;
 2570         cache_prehash(vp);
 2571 }
 2572 
 2573 void
 2574 cache_changesize(u_long newmaxvnodes)
 2575 {
 2576         struct nchashhead *new_nchashtbl, *old_nchashtbl;
 2577         u_long new_nchash, old_nchash;
 2578         struct namecache *ncp;
 2579         uint32_t hash;
 2580         u_long newncsize;
 2581         int i;
 2582 
 2583         newncsize = newmaxvnodes * ncsizefactor;
 2584         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
 2585         if (newmaxvnodes < numbucketlocks)
 2586                 newmaxvnodes = numbucketlocks;
 2587 
 2588         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
 2589         /* If same hash table size, nothing to do */
 2590         if (nchash == new_nchash) {
 2591                 ncfreetbl(new_nchashtbl);
 2592                 return;
 2593         }
 2594         /*
 2595          * Move everything from the old hash table to the new table.
 2596          * None of the namecache entries can be removed out from under us
 2597          * because removal requires the bucket locks, all of which we hold.
 2598          */
 2599         cache_lock_all_vnodes();
 2600         cache_lock_all_buckets();
 2601         old_nchashtbl = nchashtbl;
 2602         old_nchash = nchash;
 2603         nchashtbl = new_nchashtbl;
 2604         nchash = new_nchash;
 2605         for (i = 0; i <= old_nchash; i++) {
 2606                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
 2607                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
 2608                             ncp->nc_dvp);
 2609                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
 2610                         CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
 2611                 }
 2612         }
 2613         ncsize = newncsize;
 2614         cache_recalc_neg_min(ncnegminpct);
 2615         cache_unlock_all_buckets();
 2616         cache_unlock_all_vnodes();
 2617         ncfreetbl(old_nchashtbl);
 2618 }
 2619 
 2620 /*
 2621  * Remove all entries from and to a particular vnode.
 2622  */
 2623 static void
 2624 cache_purge_impl(struct vnode *vp)
 2625 {
 2626         struct cache_freebatch batch;
 2627         struct namecache *ncp;
 2628         struct mtx *vlp, *vlp2;
 2629 
 2630         TAILQ_INIT(&batch);
 2631         vlp = VP2VNODELOCK(vp);
 2632         vlp2 = NULL;
 2633         mtx_lock(vlp);
 2634 retry:
 2635         while (!LIST_EMPTY(&vp->v_cache_src)) {
 2636                 ncp = LIST_FIRST(&vp->v_cache_src);
 2637                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 2638                         goto retry;
 2639                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
 2640         }
 2641         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
 2642                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
 2643                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 2644                         goto retry;
 2645                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
 2646         }
 2647         ncp = vp->v_cache_dd;
 2648         if (ncp != NULL) {
 2649                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
 2650                    ("lost dotdot link"));
 2651                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 2652                         goto retry;
 2653                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
 2654         }
 2655         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
 2656         mtx_unlock(vlp);
 2657         if (vlp2 != NULL)
 2658                 mtx_unlock(vlp2);
 2659         cache_free_batch(&batch);
 2660 }
 2661 
 2662 /*
 2663  * Opportunistic check to see if there is anything to do.
 2664  */
 2665 static bool
 2666 cache_has_entries(struct vnode *vp)
 2667 {
 2668 
 2669         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
 2670             atomic_load_ptr(&vp->v_cache_dd) == NULL)
 2671                 return (false);
 2672         return (true);
 2673 }
 2674 
 2675 void
 2676 cache_purge(struct vnode *vp)
 2677 {
 2678 
 2679         SDT_PROBE1(vfs, namecache, purge, done, vp);
 2680         if (!cache_has_entries(vp))
 2681                 return;
 2682         cache_purge_impl(vp);
 2683 }
 2684 
 2685 /*
 2686  * Only to be used by vgone.
 2687  */
 2688 void
 2689 cache_purge_vgone(struct vnode *vp)
 2690 {
 2691         struct mtx *vlp;
 2692 
 2693         VNPASS(VN_IS_DOOMED(vp), vp);
 2694         if (cache_has_entries(vp)) {
 2695                 cache_purge_impl(vp);
 2696                 return;
 2697         }
 2698 
 2699         /*
 2700          * Serialize against a potential thread doing cache_purge.
 2701          */
 2702         vlp = VP2VNODELOCK(vp);
 2703         mtx_wait_unlocked(vlp);
 2704         if (cache_has_entries(vp)) {
 2705                 cache_purge_impl(vp);
 2706                 return;
 2707         }
 2708         return;
 2709 }
 2710 
 2711 /*
 2712  * Remove all negative entries for a particular directory vnode.
 2713  */
 2714 void
 2715 cache_purge_negative(struct vnode *vp)
 2716 {
 2717         struct cache_freebatch batch;
 2718         struct namecache *ncp, *nnp;
 2719         struct mtx *vlp;
 2720 
 2721         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
 2722         if (LIST_EMPTY(&vp->v_cache_src))
 2723                 return;
 2724         TAILQ_INIT(&batch);
 2725         vlp = VP2VNODELOCK(vp);
 2726         mtx_lock(vlp);
 2727         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
 2728                 if (!(ncp->nc_flag & NCF_NEGATIVE))
 2729                         continue;
 2730                 cache_zap_negative_locked_vnode_kl(ncp, vp);
 2731                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
 2732         }
 2733         mtx_unlock(vlp);
 2734         cache_free_batch(&batch);
 2735 }
 2736 
 2737 /*
 2738  * Entry points for modifying VOP operations.
 2739  */
 2740 void
 2741 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
 2742     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
 2743 {
 2744 
 2745         ASSERT_VOP_IN_SEQC(fdvp);
 2746         ASSERT_VOP_IN_SEQC(fvp);
 2747         ASSERT_VOP_IN_SEQC(tdvp);
 2748         if (tvp != NULL)
 2749                 ASSERT_VOP_IN_SEQC(tvp);
 2750 
 2751         cache_purge(fvp);
 2752         if (tvp != NULL) {
 2753                 cache_purge(tvp);
 2754                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
 2755                     ("%s: lingering negative entry", __func__));
 2756         } else {
 2757                 cache_remove_cnp(tdvp, tcnp);
 2758         }
 2759 
 2760         /*
 2761          * TODO
 2762          *
 2763          * Historically, renaming always purged all relevant entries, but
 2764          * that's quite wasteful.  In particular, it turns out that in many
 2765          * cases the target file is accessed immediately after a rename,
 2766          * inducing a cache miss.
 2767          *
 2768          * Recode this to reduce relocking and reuse the existing entry (if any)
 2769          * instead of just removing it above and allocating a new one here.
 2770          */
 2771         if (cache_rename_add) {
 2772                 cache_enter(tdvp, fvp, tcnp);
 2773         }
 2774 }
 2775 
 2776 void
 2777 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
 2778 {
 2779 
 2780         ASSERT_VOP_IN_SEQC(dvp);
 2781         ASSERT_VOP_IN_SEQC(vp);
 2782         cache_purge(vp);
 2783 }
 2784 
 2785 #ifdef INVARIANTS
 2786 /*
 2787  * Validate that if an entry exists it matches.
 2788  */
 2789 void
 2790 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 2791 {
 2792         struct namecache *ncp;
 2793         struct mtx *blp;
 2794         uint32_t hash;
 2795 
 2796         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 2797         if (CK_SLIST_EMPTY(NCHHASH(hash)))
 2798                 return;
 2799         blp = HASH2BUCKETLOCK(hash);
 2800         mtx_lock(blp);
 2801         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 2802                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 2803                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
 2804                         if (ncp->nc_vp != vp)
 2805                                 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n",
 2806                                     __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp,
 2807                                     ncp->nc_vp);
 2808                 }
 2809         }
 2810         mtx_unlock(blp);
 2811 }
 2812 #endif
 2813 
 2814 /*
 2815  * Flush all entries referencing a particular filesystem.
 2816  */
 2817 void
 2818 cache_purgevfs(struct mount *mp)
 2819 {
 2820         struct vnode *vp, *mvp;
 2821 
 2822         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
 2823         /*
 2824          * Somewhat wasteful iteration over all vnodes. Would be better to
 2825          * support filtering and avoid the interlock to begin with.
 2826          */
 2827         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 2828                 if (!cache_has_entries(vp)) {
 2829                         VI_UNLOCK(vp);
 2830                         continue;
 2831                 }
 2832                 vholdl(vp);
 2833                 VI_UNLOCK(vp);
 2834                 cache_purge(vp);
 2835                 vdrop(vp);
 2836         }
 2837 }
 2838 
 2839 /*
 2840  * Perform canonical checks and a cache lookup, passing on to the
 2841  * filesystem through vop_cachedlookup only if needed.
 2842  */
 2843 
 2844 int
 2845 vfs_cache_lookup(struct vop_lookup_args *ap)
 2846 {
 2847         struct vnode *dvp;
 2848         int error;
 2849         struct vnode **vpp = ap->a_vpp;
 2850         struct componentname *cnp = ap->a_cnp;
 2851         int flags = cnp->cn_flags;
 2852 
 2853         *vpp = NULL;
 2854         dvp = ap->a_dvp;
 2855 
 2856         if (dvp->v_type != VDIR)
 2857                 return (ENOTDIR);
 2858 
 2859         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 2860             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 2861                 return (EROFS);
 2862 
 2863         error = vn_dir_check_exec(dvp, cnp);
 2864         if (error != 0)
 2865                 return (error);
 2866 
 2867         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 2868         if (error == 0)
 2869                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 2870         if (error == -1)
 2871                 return (0);
 2872         return (error);
 2873 }
 2874 
 2875 /* Implementation of the getcwd syscall. */
 2876 int
 2877 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
 2878 {
 2879         char *buf, *retbuf;
 2880         size_t buflen;
 2881         int error;
 2882 
 2883         buflen = uap->buflen;
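              /* Even the shortest result ("/") needs two bytes. */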
 2884         if (__predict_false(buflen < 2))
 2885                 return (EINVAL);
 2886         if (buflen > MAXPATHLEN)
 2887                 buflen = MAXPATHLEN;
 2888 
 2889         buf = uma_zalloc(namei_zone, M_WAITOK);
 2890         error = vn_getcwd(buf, &retbuf, &buflen);
 2891         if (error == 0)
 2892                 error = copyout(retbuf, uap->buf, buflen);
 2893         uma_zfree(namei_zone, buf);
 2894         return (error);
 2895 }
 2896 
 2897 int
 2898 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
 2899 {
 2900         struct pwd *pwd;
 2901         int error;
 2902 
 2903         vfs_smr_enter();
 2904         pwd = pwd_get_smr();
 2905         error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
 2906             buflen, 0);
 2907         VFS_SMR_ASSERT_NOT_ENTERED();
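              /*
               * A negative error means the lockless (SMR) resolution had to
               * bail; retry via the locked vn_fullpath_any() variant.
               */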
 2908         if (error < 0) {
 2909                 pwd = pwd_hold(curthread);
 2910                 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
 2911                     retbuf, buflen);
 2912                 pwd_drop(pwd);
 2913         }
 2914 
 2915 #ifdef KTRACE
 2916         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
 2917                 ktrnamei(*retbuf);
 2918 #endif
 2919         return (error);
 2920 }
 2921 
 2922 static int
 2923 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
 2924     size_t size, int flags, enum uio_seg pathseg)
 2925 {
 2926         struct nameidata nd;
 2927         char *retbuf, *freebuf;
 2928         int error;
 2929 
 2930         if (flags != 0)
 2931                 return (EINVAL);
 2932         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
 2933             pathseg, path, fd, &cap_fstat_rights, td);
 2934         if ((error = namei(&nd)) != 0)
 2935                 return (error);
 2936         error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
 2937         if (error == 0) {
 2938                 error = copyout(retbuf, buf, size);
 2939                 free(freebuf, M_TEMP);
 2940         }
 2941         NDFREE(&nd, 0);
 2942         return (error);
 2943 }
 2944 
 2945 int
 2946 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
 2947 {
 2948 
 2949         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
 2950             uap->flags, UIO_USERSPACE));
 2951 }
 2952 
 2953 /*
 2954  * Retrieve the full filesystem path that corresponds to a vnode from the
 2955  * name cache (if available).
 2956  */
 2957 int
 2958 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
 2959 {
 2960         struct pwd *pwd;
 2961         char *buf;
 2962         size_t buflen;
 2963         int error;
 2964 
 2965         if (__predict_false(vp == NULL))
 2966                 return (EINVAL);
 2967 
 2968         buflen = MAXPATHLEN;
 2969         buf = malloc(buflen, M_TEMP, M_WAITOK);
 2970         vfs_smr_enter();
 2971         pwd = pwd_get_smr();
 2972         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
 2973         VFS_SMR_ASSERT_NOT_ENTERED();
 2974         if (error < 0) {
 2975                 pwd = pwd_hold(curthread);
 2976                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
 2977                 pwd_drop(pwd);
 2978         }
 2979         if (error == 0)
 2980                 *freebuf = buf;
 2981         else
 2982                 free(buf, M_TEMP);
 2983         return (error);
 2984 }
 2985 
 2986 /*
 2987  * This function is similar to vn_fullpath, but it attempts to lookup the
 2988  * pathname relative to the global root mount point.  This is required for the
 2989  * auditing sub-system, as audited pathnames must be absolute, relative to the
 2990  * global root mount point.
 2991  */
 2992 int
 2993 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
 2994 {
 2995         char *buf;
 2996         size_t buflen;
 2997         int error;
 2998 
 2999         if (__predict_false(vp == NULL))
 3000                 return (EINVAL);
 3001         buflen = MAXPATHLEN;
 3002         buf = malloc(buflen, M_TEMP, M_WAITOK);
 3003         vfs_smr_enter();
 3004         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
 3005         VFS_SMR_ASSERT_NOT_ENTERED();
 3006         if (error < 0) {
 3007                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
 3008         }
 3009         if (error == 0)
 3010                 *freebuf = buf;
 3011         else
 3012                 free(buf, M_TEMP);
 3013         return (error);
 3014 }
 3015 
 3016 static struct namecache *
 3017 vn_dd_from_dst(struct vnode *vp)
 3018 {
 3019         struct namecache *ncp;
 3020 
 3021         cache_assert_vnode_locked(vp);
 3022         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
 3023                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 3024                         return (ncp);
 3025         }
 3026         return (NULL);
 3027 }
 3028 
 3029 int
 3030 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
 3031 {
 3032         struct vnode *dvp;
 3033         struct namecache *ncp;
 3034         struct mtx *vlp;
 3035         int error;
 3036 
 3037         vlp = VP2VNODELOCK(*vp);
 3038         mtx_lock(vlp);
 3039         ncp = (*vp)->v_cache_dd;
 3040         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
 3041                 KASSERT(ncp == vn_dd_from_dst(*vp),
 3042                     ("%s: mismatch for dd entry (%p != %p)", __func__,
 3043                     ncp, vn_dd_from_dst(*vp)));
 3044         } else {
 3045                 ncp = vn_dd_from_dst(*vp);
 3046         }
 3047         if (ncp != NULL) {
 3048                 if (*buflen < ncp->nc_nlen) {
 3049                         mtx_unlock(vlp);
 3050                         vrele(*vp);
 3051                         counter_u64_add(numfullpathfail4, 1);
 3052                         error = ENOMEM;
 3053                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
 3054                             vp, NULL);
 3055                         return (error);
 3056                 }
 3057                 *buflen -= ncp->nc_nlen;
 3058                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
 3059                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
 3060                     ncp->nc_name, vp);
 3061                 dvp = *vp;
 3062                 *vp = ncp->nc_dvp;
 3063                 vref(*vp);
 3064                 mtx_unlock(vlp);
 3065                 vrele(dvp);
 3066                 return (0);
 3067         }
 3068         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
 3069 
 3070         mtx_unlock(vlp);
 3071         vn_lock(*vp, LK_SHARED | LK_RETRY);
 3072         error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
 3073         vput(*vp);
 3074         if (error) {
 3075                 counter_u64_add(numfullpathfail2, 1);
 3076                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
 3077                 return (error);
 3078         }
 3079 
 3080         *vp = dvp;
 3081         if (VN_IS_DOOMED(dvp)) {
 3082                 /* forced unmount */
 3083                 vrele(dvp);
 3084                 error = ENOENT;
 3085                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
 3086                 return (error);
 3087         }
 3088         /*
 3089          * The use count on *vp is still held by the caller.
 3090          */
 3091 
 3092         return (0);
 3093 }
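/*
 * Sketch of the vn_vptocnp() calling contract (illustrative only): the
 * reference on *vp is always consumed; on success *vp is replaced with a
 * referenced parent and the component name sits at buf + *buflen.
 */
#if 0
	vref(vp);
	while (vp != rootvnode) {
		if (vn_vptocnp(&vp, buf, &buflen) != 0)
			break;		/* reference already dropped */
		if (buflen == 0)
			break;		/* out of space; cleanup elided */
		buf[--buflen] = '/';	/* separate the components */
	}
#endif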
 3094 
 3095 /*
 3096  * Resolve a directory to a pathname.
 3097  *
 3098  * The name of the directory can always be found in the namecache or fetched
 3099  * from the filesystem. There is also guaranteed to be only one parent, meaning
 3100  * we can just follow vnodes up until we find the root.
 3101  *
 3102  * The vnode must be referenced.
 3103  */
 3104 static int
 3105 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
 3106     size_t *len, size_t addend)
 3107 {
 3108 #ifdef KDTRACE_HOOKS
 3109         struct vnode *startvp = vp;
 3110 #endif
 3111         struct vnode *vp1;
 3112         size_t buflen;
 3113         int error;
 3114         bool slash_prefixed;
 3115 
 3116         VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
 3117         VNPASS(vp->v_usecount > 0, vp);
 3118 
 3119         buflen = *len;
 3120 
 3121         slash_prefixed = true;
 3122         if (addend == 0) {
 3123                 MPASS(*len >= 2);
 3124                 buflen--;
 3125                 buf[buflen] = '\0';
 3126                 slash_prefixed = false;
 3127         }
 3128 
 3129         error = 0;
 3130 
 3131         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
 3132         counter_u64_add(numfullpathcalls, 1);
 3133         while (vp != rdir && vp != rootvnode) {
 3134                 /*
 3135                  * The vp vnode must already be fully constructed,
 3136                  * since it is either found in the namecache or obtained
 3137                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
 3138                  * without obtaining the vnode lock.
 3139                  */
 3140                 if ((vp->v_vflag & VV_ROOT) != 0) {
 3141                         vn_lock(vp, LK_RETRY | LK_SHARED);
 3142 
 3143                         /*
 3144                          * With the vnode locked, check for races with
 3145                          * unmount, forced or not.  Note that we
 3146                          * already verified that vp is not equal to
 3147                          * the root vnode, which means that
 3148                          * mnt_vnodecovered can be NULL only for the
 3149                          * case of unmount.
 3150                          */
 3151                         if (VN_IS_DOOMED(vp) ||
 3152                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
 3153                             vp1->v_mountedhere != vp->v_mount) {
 3154                                 vput(vp);
 3155                                 error = ENOENT;
 3156                                 SDT_PROBE3(vfs, namecache, fullpath, return,
 3157                                     error, vp, NULL);
 3158                                 break;
 3159                         }
 3160 
 3161                         vref(vp1);
 3162                         vput(vp);
 3163                         vp = vp1;
 3164                         continue;
 3165                 }
 3166                 if (vp->v_type != VDIR) {
 3167                         vrele(vp);
 3168                         counter_u64_add(numfullpathfail1, 1);
 3169                         error = ENOTDIR;
 3170                         SDT_PROBE3(vfs, namecache, fullpath, return,
 3171                             error, vp, NULL);
 3172                         break;
 3173                 }
 3174                 error = vn_vptocnp(&vp, buf, &buflen);
 3175                 if (error)
 3176                         break;
 3177                 if (buflen == 0) {
 3178                         vrele(vp);
 3179                         error = ENOMEM;
 3180                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
 3181                             startvp, NULL);
 3182                         break;
 3183                 }
 3184                 buf[--buflen] = '/';
 3185                 slash_prefixed = true;
 3186         }
 3187         if (error)
 3188                 return (error);
 3189         if (!slash_prefixed) {
 3190                 if (buflen == 0) {
 3191                         vrele(vp);
 3192                         counter_u64_add(numfullpathfail4, 1);
 3193                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
 3194                             startvp, NULL);
 3195                         return (ENOMEM);
 3196                 }
 3197                 buf[--buflen] = '/';
 3198         }
 3199         counter_u64_add(numfullpathfound, 1);
 3200         vrele(vp);
 3201 
 3202         *retbuf = buf + buflen;
 3203         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
 3204         *len -= buflen;
 3205         *len += addend;
 3206         return (0);
 3207 }
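/*
 * The buffer management above assembles the path right to left: the buffer
 * is NUL-terminated up front, each component is copied in front of what is
 * already there and *retbuf ends up pointing into the middle of the buffer,
 * avoiding a final memmove.  A self-contained userspace illustration of the
 * same technique (hypothetical names; needs <string.h> and <limits.h>):
 */
#if 0
	char buf[PATH_MAX];
	size_t pos = sizeof(buf);

	buf[--pos] = '\0';
	pos -= namelen;			/* leaf component first */
	memcpy(buf + pos, name, namelen);
	buf[--pos] = '/';
	/* ... repeat for each parent; result starts at &buf[pos] */
#endif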
 3208 
 3209 /*
 3210  * Resolve an arbitrary vnode to a pathname.
 3211  *
 3212  * Two caveats:
 3213  * - hardlinks are not tracked, thus if the vnode is not a directory this can
 3214  *   resolve to a different path than the one used to find it
 3215  * - the namecache is not mandatory, meaning names are not guaranteed to be
 3216  *   added (in which case resolving fails)
 3217  */
 3218 static void __inline
 3219 cache_rev_failed_impl(int *reason, int line)
 3220 {
 3221 
 3222         *reason = line;
 3223 }
 3224 #define cache_rev_failed(var)   cache_rev_failed_impl((var), __LINE__)
 3225 
 3226 static int
 3227 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
 3228     char **retbuf, size_t *buflen, size_t addend)
 3229 {
 3230 #ifdef KDTRACE_HOOKS
 3231         struct vnode *startvp = vp;
 3232 #endif
 3233         struct vnode *tvp;
 3234         struct mount *mp;
 3235         struct namecache *ncp;
 3236         size_t orig_buflen;
 3237         int reason;
 3238         int error;
 3239 #ifdef KDTRACE_HOOKS
 3240         int i;
 3241 #endif
 3242         seqc_t vp_seqc, tvp_seqc;
 3243         u_char nc_flag;
 3244 
 3245         VFS_SMR_ASSERT_ENTERED();
 3246 
 3247         if (!cache_fast_revlookup) {
 3248                 vfs_smr_exit();
 3249                 return (-1);
 3250         }
 3251 
 3252         orig_buflen = *buflen;
 3253 
 3254         if (addend == 0) {
 3255                 MPASS(*buflen >= 2);
 3256                 *buflen -= 1;
 3257                 buf[*buflen] = '\0';
 3258         }
 3259 
 3260         if (vp == rdir || vp == rootvnode) {
 3261                 if (addend == 0) {
 3262                         *buflen -= 1;
 3263                         buf[*buflen] = '/';
 3264                 }
 3265                 goto out_ok;
 3266         }
 3267 
 3268 #ifdef KDTRACE_HOOKS
 3269         i = 0;
 3270 #endif
 3271         error = -1;
 3272         ncp = NULL; /* for sdt probe down below */
 3273         vp_seqc = vn_seqc_read_any(vp);
 3274         if (seqc_in_modify(vp_seqc)) {
 3275                 cache_rev_failed(&reason);
 3276                 goto out_abort;
 3277         }
 3278 
 3279         for (;;) {
 3280 #ifdef KDTRACE_HOOKS
 3281                 i++;
 3282 #endif
 3283                 if ((vp->v_vflag & VV_ROOT) != 0) {
 3284                         mp = atomic_load_ptr(&vp->v_mount);
 3285                         if (mp == NULL) {
 3286                                 cache_rev_failed(&reason);
 3287                                 goto out_abort;
 3288                         }
 3289                         tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
 3290                         tvp_seqc = vn_seqc_read_any(tvp);
 3291                         if (seqc_in_modify(tvp_seqc)) {
 3292                                 cache_rev_failed(&reason);
 3293                                 goto out_abort;
 3294                         }
 3295                         if (!vn_seqc_consistent(vp, vp_seqc)) {
 3296                                 cache_rev_failed(&reason);
 3297                                 goto out_abort;
 3298                         }
 3299                         vp = tvp;
 3300                         vp_seqc = tvp_seqc;
 3301                         continue;
 3302                 }
 3303                 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
 3304                 if (ncp == NULL) {
 3305                         cache_rev_failed(&reason);
 3306                         goto out_abort;
 3307                 }
 3308                 nc_flag = atomic_load_char(&ncp->nc_flag);
 3309                 if ((nc_flag & NCF_ISDOTDOT) != 0) {
 3310                         cache_rev_failed(&reason);
 3311                         goto out_abort;
 3312                 }
 3313                 if (ncp->nc_nlen >= *buflen) {
 3314                         cache_rev_failed(&reason);
 3315                         error = ENOMEM;
 3316                         goto out_abort;
 3317                 }
 3318                 *buflen -= ncp->nc_nlen;
 3319                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
 3320                 *buflen -= 1;
 3321                 buf[*buflen] = '/';
 3322                 tvp = ncp->nc_dvp;
 3323                 tvp_seqc = vn_seqc_read_any(tvp);
 3324                 if (seqc_in_modify(tvp_seqc)) {
 3325                         cache_rev_failed(&reason);
 3326                         goto out_abort;
 3327                 }
 3328                 if (!vn_seqc_consistent(vp, vp_seqc)) {
 3329                         cache_rev_failed(&reason);
 3330                         goto out_abort;
 3331                 }
 3332                 /*
 3333                  * Acquire fence provided by vn_seqc_read_any above.
 3334                  */
 3335                 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
 3336                         cache_rev_failed(&reason);
 3337                         goto out_abort;
 3338                 }
 3339                 if (!cache_ncp_canuse(ncp)) {
 3340                         cache_rev_failed(&reason);
 3341                         goto out_abort;
 3342                 }
 3343                 vp = tvp;
 3344                 vp_seqc = tvp_seqc;
 3345                 if (vp == rdir || vp == rootvnode)
 3346                         break;
 3347         }
 3348 out_ok:
 3349         vfs_smr_exit();
 3350         *retbuf = buf + *buflen;
 3351         *buflen = orig_buflen - *buflen + addend;
 3352         SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
 3353         return (0);
 3354 
 3355 out_abort:
 3356         *buflen = orig_buflen;
 3357         SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
 3358         vfs_smr_exit();
 3359         return (error);
 3360 }
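/*
 * The lockless walk above hinges on sequence counters: read the counter,
 * read the data it protects, then confirm the counter did not move.  A
 * generic sketch of the pattern as used here (failure handling elided):
 */
#if 0
	vp_seqc = vn_seqc_read_any(vp);
	if (seqc_in_modify(vp_seqc))
		goto out_abort;		/* writer active right now */
	ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
	/* ... use ncp ... */
	if (!vn_seqc_consistent(vp, vp_seqc))
		goto out_abort;		/* raced with a writer */
#endif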
 3361 
 3362 static int
 3363 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
 3364     size_t *buflen)
 3365 {
 3366         size_t orig_buflen, addend;
 3367         int error;
 3368 
 3369         if (*buflen < 2)
 3370                 return (EINVAL);
 3371 
 3372         orig_buflen = *buflen;
 3373 
 3374         vref(vp);
 3375         addend = 0;
 3376         if (vp->v_type != VDIR) {
 3377                 *buflen -= 1;
 3378                 buf[*buflen] = '\0';
 3379                 error = vn_vptocnp(&vp, buf, buflen);
 3380                 if (error)
 3381                         return (error);
 3382                 if (*buflen == 0) {
 3383                         vrele(vp);
 3384                         return (ENOMEM);
 3385                 }
 3386                 *buflen -= 1;
 3387                 buf[*buflen] = '/';
 3388                 addend = orig_buflen - *buflen;
 3389         }
 3390 
 3391         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
 3392 }
 3393 
 3394 /*
 3395  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
 3396  *
 3397  * Since the namecache does not track hardlinks, the caller is expected to first
 3398  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
 3399  *
 3400  * There are then two cases:
 3401  * - if the found vnode is a directory, the path can be constructed just by
 3402  *   following names up the chain
 3403  * - otherwise we populate the buffer with the saved name and start resolving
 3404  *   from the parent
 3405  */
 3406 static int
 3407 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
 3408     size_t *buflen)
 3409 {
 3410         char *buf, *tmpbuf;
 3411         struct pwd *pwd;
 3412         struct componentname *cnp;
 3413         struct vnode *vp;
 3414         size_t addend;
 3415         int error;
 3416         enum vtype type;
 3417 
 3418         if (*buflen < 2)
 3419                 return (EINVAL);
 3420         if (*buflen > MAXPATHLEN)
 3421                 *buflen = MAXPATHLEN;
 3422 
 3423         buf = malloc(*buflen, M_TEMP, M_WAITOK);
 3424 
 3425         addend = 0;
 3426         vp = ndp->ni_vp;
 3427         /*
 3428          * Check for VBAD to work around the vp_crossmp bug in lookup().
 3429          *
 3430          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
 3431          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
 3432          * If the type is VDIR (like in this very case) we can skip looking
 3433          * at ni_dvp in the first place. However, since vnodes get passed here
 3434          * unlocked the target may transition to doomed state (type == VBAD)
 3435          * before we get to evaluate the condition. If this happens, we will
 3436          * populate part of the buffer and descend to vn_fullpath_dir with
 3437          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
 3438          *
 3439          * This should be atomic_load(&vp->v_type) but it is illegal to take
 3440          * an address of a bit field, even if said field is sized to char.
 3441          * Work around the problem by reading the value into a full-sized enum
 3442          * and then re-reading it with atomic_load which will still prevent
 3443          * the compiler from re-reading down the road.
 3444          */
 3445         type = vp->v_type;
 3446         type = atomic_load_int(&type);
 3447         if (type == VBAD) {
 3448                 error = ENOENT;
 3449                 goto out_bad;
 3450         }
 3451         if (type != VDIR) {
 3452                 cnp = &ndp->ni_cnd;
 3453                 addend = cnp->cn_namelen + 2;
 3454                 if (*buflen < addend) {
 3455                         error = ENOMEM;
 3456                         goto out_bad;
 3457                 }
 3458                 *buflen -= addend;
 3459                 tmpbuf = buf + *buflen;
 3460                 tmpbuf[0] = '/';
 3461                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
 3462                 tmpbuf[addend - 1] = '\0';
 3463                 vp = ndp->ni_dvp;
 3464         }
 3465 
 3466         vfs_smr_enter();
 3467         pwd = pwd_get_smr();
 3468         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
 3469             addend);
 3470         VFS_SMR_ASSERT_NOT_ENTERED();
 3471         if (error < 0) {
 3472                 pwd = pwd_hold(curthread);
 3473                 vref(vp);
 3474                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
 3475                     addend);
 3476                 pwd_drop(pwd);
 3477                 if (error != 0)
 3478                         goto out_bad;
 3479         }
 3480 
 3481         *freebuf = buf;
 3482 
 3483         return (0);
 3484 out_bad:
 3485         free(buf, M_TEMP);
 3486         return (error);
 3487 }
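/*
 * For a non-directory leaf the code above pre-populates the tail of the
 * buffer with "/name\0" and passes addend (cn_namelen + 2) down so that
 * vn_fullpath_dir() knows how much of the tail is already in use:
 *
 *	buf: [ ...directory part is prepended here... ][ '/' ][ name ][ '\0' ]
 *	                                                ^
 *	                                                buf + *buflen
 */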
 3488 
 3489 struct vnode *
 3490 vn_dir_dd_ino(struct vnode *vp)
 3491 {
 3492         struct namecache *ncp;
 3493         struct vnode *ddvp;
 3494         struct mtx *vlp;
 3495         enum vgetstate vs;
 3496 
 3497         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
 3498         vlp = VP2VNODELOCK(vp);
 3499         mtx_lock(vlp);
 3500         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
 3501                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
 3502                         continue;
 3503                 ddvp = ncp->nc_dvp;
 3504                 vs = vget_prep(ddvp);
 3505                 mtx_unlock(vlp);
 3506                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
 3507                         return (NULL);
 3508                 return (ddvp);
 3509         }
 3510         mtx_unlock(vlp);
 3511         return (NULL);
 3512 }
 3513 
 3514 int
 3515 vn_commname(struct vnode *vp, char *buf, u_int buflen)
 3516 {
 3517         struct namecache *ncp;
 3518         struct mtx *vlp;
 3519         int l;
 3520 
 3521         vlp = VP2VNODELOCK(vp);
 3522         mtx_lock(vlp);
 3523         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
 3524                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 3525                         break;
 3526         if (ncp == NULL) {
 3527                 mtx_unlock(vlp);
 3528                 return (ENOENT);
 3529         }
 3530         l = min(ncp->nc_nlen, buflen - 1);
 3531         memcpy(buf, ncp->nc_name, l);
 3532         mtx_unlock(vlp);
 3533         buf[l] = '\0';
 3534         return (0);
 3535 }
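/*
 * A hedged usage sketch for vn_commname(): at most buflen - 1 bytes of the
 * cached name are copied and the result is always NUL-terminated
 * (hypothetical caller):
 */
#if 0
	char name[NAME_MAX + 1];

	if (vn_commname(vp, name, sizeof(name)) == 0)
		printf("cached name: %s\n", name);
	/* ENOENT means no regular entry is cached for vp */
#endif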
 3536 
 3537 /*
 3538  * This function updates the path string to the vnode's full global path
 3539  * and checks that the new path string fits into the pathlen argument.
 3540  *
 3541  * Requires a locked, referenced vnode.
 3542  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 3543  *
 3544  * If vp is a directory, the call to vn_fullpath_global() always succeeds
 3545  * because it falls back to the ".." lookup if the namecache lookup fails.
 3546  */
 3547 int
 3548 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
 3549     u_int pathlen)
 3550 {
 3551         struct nameidata nd;
 3552         struct vnode *vp1;
 3553         char *rpath, *fbuf;
 3554         int error;
 3555 
 3556         ASSERT_VOP_ELOCKED(vp, __func__);
 3557 
 3558         /* Construct global filesystem path from vp. */
 3559         VOP_UNLOCK(vp);
 3560         error = vn_fullpath_global(vp, &rpath, &fbuf);
 3561 
 3562         if (error != 0) {
 3563                 vrele(vp);
 3564                 return (error);
 3565         }
 3566 
 3567         if (strlen(rpath) >= pathlen) {
 3568                 vrele(vp);
 3569                 error = ENAMETOOLONG;
 3570                 goto out;
 3571         }
 3572 
 3573         /*
 3574          * Re-lookup the vnode by path to detect a possible rename.
 3575          * As a side effect, the vnode is relocked.
 3576          * If vnode was renamed, return ENOENT.
 3577          */
 3578         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 3579             UIO_SYSSPACE, path, td);
 3580         error = namei(&nd);
 3581         if (error != 0) {
 3582                 vrele(vp);
 3583                 goto out;
 3584         }
 3585         NDFREE(&nd, NDF_ONLY_PNBUF);
 3586         vp1 = nd.ni_vp;
 3587         vrele(vp);
 3588         if (vp1 == vp)
 3589                 strcpy(path, rpath);
 3590         else {
 3591                 vput(vp1);
 3592                 error = ENOENT;
 3593         }
 3594 
 3595 out:
 3596         free(fbuf, M_TEMP);
 3597         return (error);
 3598 }
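/*
 * A usage sketch under the contract documented above (the vnode comes in
 * locked and referenced; hypothetical caller):
 */
#if 0
	error = vn_path_to_global_path(td, vp, path, MAXPATHLEN);
	if (error == 0) {
		/* path was rewritten in place; vp is locked again */
	} else {
		/* vp was unlocked and its reference dropped */
	}
#endif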
 3599 
 3600 #ifdef DDB
 3601 static void
 3602 db_print_vpath(struct vnode *vp)
 3603 {
 3604 
 3605         while (vp != NULL) {
 3606                 db_printf("%p: ", vp);
 3607                 if (vp == rootvnode) {
 3608                         db_printf("/");
 3609                         vp = NULL;
 3610                 } else {
 3611                         if (vp->v_vflag & VV_ROOT) {
 3612                                 db_printf("<mount point>");
 3613                                 vp = vp->v_mount->mnt_vnodecovered;
 3614                         } else {
 3615                                 struct namecache *ncp;
 3616                                 char *ncn;
 3617                                 int i;
 3618 
 3619                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
 3620                                 if (ncp != NULL) {
 3621                                         ncn = ncp->nc_name;
 3622                                         for (i = 0; i < ncp->nc_nlen; i++)
 3623                                                 db_printf("%c", *ncn++);
 3624                                         vp = ncp->nc_dvp;
 3625                                 } else {
 3626                                         vp = NULL;
 3627                                 }
 3628                         }
 3629                 }
 3630                 db_printf("\n");
 3631         }
 3632 
 3633         return;
 3634 }
 3635 
 3636 DB_SHOW_COMMAND(vpath, db_show_vpath)
 3637 {
 3638         struct vnode *vp;
 3639 
 3640         if (!have_addr) {
 3641                 db_printf("usage: show vpath <struct vnode *>\n");
 3642                 return;
 3643         }
 3644 
 3645         vp = (struct vnode *)addr;
 3646         db_print_vpath(vp);
 3647 }
 3648 
 3649 #endif
 3650 
 3651 static int cache_fast_lookup = 1;
 3652 static char __read_frequently cache_fast_lookup_enabled = true;
 3653 
 3654 #define CACHE_FPL_FAILED        -2020
 3655 
 3656 void
 3657 cache_fast_lookup_enabled_recalc(void)
 3658 {
 3659         int lookup_flag;
 3660         int mac_on;
 3661 
 3662 #ifdef MAC
 3663         mac_on = mac_vnode_check_lookup_enabled();
 3664         mac_on |= mac_vnode_check_readlink_enabled();
 3665 #else
 3666         mac_on = 0;
 3667 #endif
 3668 
 3669         lookup_flag = atomic_load_int(&cache_fast_lookup);
 3670         if (lookup_flag && !mac_on) {
 3671                 atomic_store_char(&cache_fast_lookup_enabled, true);
 3672         } else {
 3673                 atomic_store_char(&cache_fast_lookup_enabled, false);
 3674         }
 3675 }
 3676 
 3677 static int
 3678 sysctl_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
 3679 {
 3680         int error, old;
 3681 
 3682         old = atomic_load_int(&cache_fast_lookup);
 3683         error = sysctl_handle_int(oidp, arg1, arg2, req);
 3684         if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
 3685                 cache_fast_lookup_enabled_recalc();
 3686         return (error);
 3687 }
 3688 SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
 3689     &cache_fast_lookup, 0, sysctl_vfs_cache_fast_lookup, "IU", "");
 3690 
 3691 /*
 3692  * Components of nameidata (or objects it can point to) which may
 3693  * need restoring in case the fast path lookup fails.
 3694  */
 3695 struct nameidata_outer {
 3696         size_t ni_pathlen;
 3697         int cn_flags;
 3698 };
 3699 
 3700 struct nameidata_saved {
 3701 #ifdef INVARIANTS
 3702         char *cn_nameptr;
 3703         size_t ni_pathlen;
 3704 #endif
 3705 };
 3706 
 3707 #ifdef INVARIANTS
 3708 struct cache_fpl_debug {
 3709         size_t ni_pathlen;
 3710 };
 3711 #endif
 3712 
 3713 struct cache_fpl {
 3714         struct nameidata *ndp;
 3715         struct componentname *cnp;
 3716         char *nulchar;
 3717         struct vnode *dvp;
 3718         struct vnode *tvp;
 3719         seqc_t dvp_seqc;
 3720         seqc_t tvp_seqc;
 3721         uint32_t hash;
 3722         struct nameidata_saved snd;
 3723         struct nameidata_outer snd_outer;
 3724         int line;
 3725         enum cache_fpl_status status:8;
 3726         bool in_smr;
 3727         bool fsearch;
 3728         bool savename;
 3729         struct pwd **pwd;
 3730 #ifdef INVARIANTS
 3731         struct cache_fpl_debug debug;
 3732 #endif
 3733 };
 3734 
 3735 static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
 3736 static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
 3737 static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
 3738 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
 3739 static int cache_fplookup_trailingslash(struct cache_fpl *fpl);
 3740 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
 3741 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
 3742 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
 3743 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
 3744 
 3745 static void
 3746 cache_fpl_cleanup_cnp(struct componentname *cnp)
 3747 {
 3748 
 3749         uma_zfree(namei_zone, cnp->cn_pnbuf);
 3750 #ifdef DIAGNOSTIC
 3751         cnp->cn_pnbuf = NULL;
 3752         cnp->cn_nameptr = NULL;
 3753 #endif
 3754 }
 3755 
 3756 static struct vnode *
 3757 cache_fpl_handle_root(struct cache_fpl *fpl)
 3758 {
 3759         struct nameidata *ndp;
 3760         struct componentname *cnp;
 3761 
 3762         ndp = fpl->ndp;
 3763         cnp = fpl->cnp;
 3764 
 3765         MPASS(*(cnp->cn_nameptr) == '/');
 3766         cnp->cn_nameptr++;
 3767         cache_fpl_pathlen_dec(fpl);
 3768 
 3769         if (__predict_false(*(cnp->cn_nameptr) == '/')) {
 3770                 do {
 3771                         cnp->cn_nameptr++;
 3772                         cache_fpl_pathlen_dec(fpl);
 3773                 } while (*(cnp->cn_nameptr) == '/');
 3774         }
 3775 
 3776         return (ndp->ni_rootdir);
 3777 }
 3778 
 3779 static void
 3780 cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
 3781 {
 3782 
 3783         fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
 3784         fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
 3785 }
 3786 
 3787 static void
 3788 cache_fpl_checkpoint(struct cache_fpl *fpl)
 3789 {
 3790 
 3791 #ifdef INVARIANTS
 3792         fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
 3793         fpl->snd.ni_pathlen = fpl->debug.ni_pathlen;
 3794 #endif
 3795 }
 3796 
 3797 static void
 3798 cache_fpl_restore_partial(struct cache_fpl *fpl)
 3799 {
 3800 
 3801         fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
 3802 #ifdef INVARIANTS
 3803         fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
 3804 #endif
 3805 }
 3806 
 3807 static void
 3808 cache_fpl_restore_abort(struct cache_fpl *fpl)
 3809 {
 3810 
 3811         cache_fpl_restore_partial(fpl);
 3812         /*
 3813          * ni_resflags is 0 on entry by API contract.
 3814          */
 3815         fpl->ndp->ni_resflags = 0;
 3816         fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
 3817         fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
 3818 }
 3819 
 3820 #ifdef INVARIANTS
 3821 #define cache_fpl_smr_assert_entered(fpl) ({                    \
 3822         struct cache_fpl *_fpl = (fpl);                         \
 3823         MPASS(_fpl->in_smr == true);                            \
 3824         VFS_SMR_ASSERT_ENTERED();                               \
 3825 })
 3826 #define cache_fpl_smr_assert_not_entered(fpl) ({                \
 3827         struct cache_fpl *_fpl = (fpl);                         \
 3828         MPASS(_fpl->in_smr == false);                           \
 3829         VFS_SMR_ASSERT_NOT_ENTERED();                           \
 3830 })
 3831 static void
 3832 cache_fpl_assert_status(struct cache_fpl *fpl)
 3833 {
 3834 
 3835         switch (fpl->status) {
 3836         case CACHE_FPL_STATUS_UNSET:
 3837                 __assert_unreachable();
 3838                 break;
 3839         case CACHE_FPL_STATUS_DESTROYED:
 3840         case CACHE_FPL_STATUS_ABORTED:
 3841         case CACHE_FPL_STATUS_PARTIAL:
 3842         case CACHE_FPL_STATUS_HANDLED:
 3843                 break;
 3844         }
 3845 }
 3846 #else
 3847 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
 3848 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
 3849 #define cache_fpl_assert_status(fpl) do { } while (0)
 3850 #endif
 3851 
 3852 #define cache_fpl_smr_enter_initial(fpl) ({                     \
 3853         struct cache_fpl *_fpl = (fpl);                         \
 3854         vfs_smr_enter();                                        \
 3855         _fpl->in_smr = true;                                    \
 3856 })
 3857 
 3858 #define cache_fpl_smr_enter(fpl) ({                             \
 3859         struct cache_fpl *_fpl = (fpl);                         \
 3860         MPASS(_fpl->in_smr == false);                           \
 3861         vfs_smr_enter();                                        \
 3862         _fpl->in_smr = true;                                    \
 3863 })
 3864 
 3865 #define cache_fpl_smr_exit(fpl) ({                              \
 3866         struct cache_fpl *_fpl = (fpl);                         \
 3867         MPASS(_fpl->in_smr == true);                            \
 3868         vfs_smr_exit();                                         \
 3869         _fpl->in_smr = false;                                   \
 3870 })
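/*
 * The macros above pair vfs_smr_enter()/vfs_smr_exit() with bookkeeping in
 * fpl->in_smr so that mismatched brackets trip the assertions.  Sketch of
 * the intended shape of a fast-path routine (illustrative; the condition
 * name is hypothetical):
 */
#if 0
	cache_fpl_smr_enter_initial(fpl);
	/* ... lockless lookup work ... */
	if (must_take_locks) {
		cache_fpl_smr_exit(fpl);
		/* acquire locks; cache_fpl_smr_enter(fpl) to retry */
	}
	cache_fpl_smr_assert_not_entered(fpl);
#endif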
 3871 
 3872 static int
 3873 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
 3874 {
 3875 
 3876         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
 3877                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
 3878                     ("%s: converting to abort from %d at %d, set at %d\n",
 3879                     __func__, fpl->status, line, fpl->line));
 3880         }
 3881         cache_fpl_smr_assert_not_entered(fpl);
 3882         fpl->status = CACHE_FPL_STATUS_ABORTED;
 3883         fpl->line = line;
 3884         return (CACHE_FPL_FAILED);
 3885 }
 3886 
 3887 #define cache_fpl_aborted_early(x)      cache_fpl_aborted_early_impl((x), __LINE__)
 3888 
 3889 static int __noinline
 3890 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
 3891 {
 3892         struct nameidata *ndp;
 3893         struct componentname *cnp;
 3894 
 3895         ndp = fpl->ndp;
 3896         cnp = fpl->cnp;
 3897 
 3898         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
 3899                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
 3900                     ("%s: converting to abort from %d at %d, set at %d\n",
 3901                     __func__, fpl->status, line, fpl->line));
 3902         }
 3903         fpl->status = CACHE_FPL_STATUS_ABORTED;
 3904         fpl->line = line;
 3905         if (fpl->in_smr)
 3906                 cache_fpl_smr_exit(fpl);
 3907         cache_fpl_restore_abort(fpl);
 3908         /*
 3909          * Resolving symlinks overwrites data passed by the caller.
 3910          * Let namei know.
 3911          */
 3912         if (ndp->ni_loopcnt > 0) {
 3913                 fpl->status = CACHE_FPL_STATUS_DESTROYED;
 3914                 cache_fpl_cleanup_cnp(cnp);
 3915         }
 3916         return (CACHE_FPL_FAILED);
 3917 }
 3918 
 3919 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
 3920 
 3921 static int __noinline
 3922 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
 3923 {
 3924 
 3925         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
 3926             ("%s: setting to partial at %d, but already set to %d at %d\n",
 3927             __func__, line, fpl->status, fpl->line));
 3928         cache_fpl_smr_assert_entered(fpl);
 3929         fpl->status = CACHE_FPL_STATUS_PARTIAL;
 3930         fpl->line = line;
 3931         return (cache_fplookup_partial_setup(fpl));
 3932 }
 3933 
 3934 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
 3935 
 3936 static int
 3937 cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
 3938 {
 3939 
 3940         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
 3941             ("%s: setting to handled at %d, but already set to %d at %d\n",
 3942             __func__, line, fpl->status, fpl->line));
 3943         cache_fpl_smr_assert_not_entered(fpl);
 3944         fpl->status = CACHE_FPL_STATUS_HANDLED;
 3945         fpl->line = line;
 3946         return (0);
 3947 }
 3948 
 3949 #define cache_fpl_handled(x)    cache_fpl_handled_impl((x), __LINE__)
 3950 
 3951 static int
 3952 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
 3953 {
 3954 
 3955         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
 3956             ("%s: setting to handled at %d, but already set to %d at %d\n",
 3957             __func__, line, fpl->status, fpl->line));
 3958         MPASS(error != 0);
 3959         MPASS(error != CACHE_FPL_FAILED);
 3960         cache_fpl_smr_assert_not_entered(fpl);
 3961         fpl->status = CACHE_FPL_STATUS_HANDLED;
 3962         fpl->line = line;
 3963         fpl->dvp = NULL;
 3964         fpl->tvp = NULL;
 3965         fpl->savename = false;
 3966         return (error);
 3967 }
 3968 
 3969 #define cache_fpl_handled_error(x, e)   cache_fpl_handled_error_impl((x), (e), __LINE__)
 3970 
 3971 static bool
 3972 cache_fpl_terminated(struct cache_fpl *fpl)
 3973 {
 3974 
 3975         return (fpl->status != CACHE_FPL_STATUS_UNSET);
 3976 }
 3977 
 3978 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
 3979         (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
 3980          FAILIFEXISTS | FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | \
 3981          ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
 3982 
 3983 #define CACHE_FPL_INTERNAL_CN_FLAGS \
 3984         (ISDOTDOT | MAKEENTRY | ISLASTCN)
 3985 
 3986 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
 3987     "supported and internal flags overlap");
 3988 
 3989 static bool
 3990 cache_fpl_islastcn(struct nameidata *ndp)
 3991 {
 3992 
 3993         return (*ndp->ni_next == 0);
 3994 }
 3995 
 3996 static bool
 3997 cache_fpl_istrailingslash(struct cache_fpl *fpl)
 3998 {
 3999 
 4000         return (*(fpl->nulchar - 1) == '/');
 4001 }
 4002 
 4003 static bool
 4004 cache_fpl_isdotdot(struct componentname *cnp)
 4005 {
 4006 
 4007         if (cnp->cn_namelen == 2 &&
 4008             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
 4009                 return (true);
 4010         return (false);
 4011 }
 4012 
 4013 static bool
 4014 cache_can_fplookup(struct cache_fpl *fpl)
 4015 {
 4016         struct nameidata *ndp;
 4017         struct componentname *cnp;
 4018         struct thread *td;
 4019 
 4020         ndp = fpl->ndp;
 4021         cnp = fpl->cnp;
 4022         td = cnp->cn_thread;
 4023 
 4024         if (!atomic_load_char(&cache_fast_lookup_enabled)) {
 4025                 cache_fpl_aborted_early(fpl);
 4026                 return (false);
 4027         }
 4028         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
 4029                 cache_fpl_aborted_early(fpl);
 4030                 return (false);
 4031         }
 4032         if (IN_CAPABILITY_MODE(td)) {
 4033                 cache_fpl_aborted_early(fpl);
 4034                 return (false);
 4035         }
 4036         if (AUDITING_TD(td)) {
 4037                 cache_fpl_aborted_early(fpl);
 4038                 return (false);
 4039         }
 4040         if (ndp->ni_startdir != NULL) {
 4041                 cache_fpl_aborted_early(fpl);
 4042                 return (false);
 4043         }
 4044         return (true);
 4045 }
 4046 
 4047 static int
 4048 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
 4049 {
 4050         struct nameidata *ndp;
 4051         int error;
 4052         bool fsearch;
 4053 
 4054         ndp = fpl->ndp;
 4055         error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
 4056         if (__predict_false(error != 0)) {
 4057                 return (cache_fpl_aborted(fpl));
 4058         }
 4059         fpl->fsearch = fsearch;
 4060         return (0);
 4061 }
 4062 
 4063 static int __noinline
 4064 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
 4065     uint32_t hash)
 4066 {
 4067         struct componentname *cnp;
 4068         struct vnode *dvp;
 4069 
 4070         cnp = fpl->cnp;
 4071         dvp = fpl->dvp;
 4072 
 4073         cache_fpl_smr_exit(fpl);
 4074         if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
 4075                 return (cache_fpl_handled_error(fpl, ENOENT));
 4076         else
 4077                 return (cache_fpl_aborted(fpl));
 4078 }
 4079 
 4080 /*
 4081  * The target vnode is not supported; prepare for the slow path to take over.
 4082  */
 4083 static int __noinline
 4084 cache_fplookup_partial_setup(struct cache_fpl *fpl)
 4085 {
 4086         struct nameidata *ndp;
 4087         struct componentname *cnp;
 4088         enum vgetstate dvs;
 4089         struct vnode *dvp;
 4090         struct pwd *pwd;
 4091         seqc_t dvp_seqc;
 4092 
 4093         ndp = fpl->ndp;
 4094         cnp = fpl->cnp;
 4095         pwd = *(fpl->pwd);
 4096         dvp = fpl->dvp;
 4097         dvp_seqc = fpl->dvp_seqc;
 4098 
 4099         if (!pwd_hold_smr(pwd)) {
 4100                 return (cache_fpl_aborted(fpl));
 4101         }
 4102 
 4103         /*
 4104          * Note that seqc is checked before the vnode is locked, so by
 4105          * the time regular lookup gets to it, the vnode may have moved.
 4106          *
 4107          * Ultimately this does not affect correctness, any lookup errors
 4108          * are userspace racing with itself. It is guaranteed that any
 4109          * path which ultimately gets found could also have been found
 4110          * by regular lookup going all the way in absence of concurrent
 4111          * modifications.
 4112          */
 4113         dvs = vget_prep_smr(dvp);
 4114         cache_fpl_smr_exit(fpl);
 4115         if (__predict_false(dvs == VGET_NONE)) {
 4116                 pwd_drop(pwd);
 4117                 return (cache_fpl_aborted(fpl));
 4118         }
 4119 
 4120         vget_finish_ref(dvp, dvs);
 4121         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 4122                 vrele(dvp);
 4123                 pwd_drop(pwd);
 4124                 return (cache_fpl_aborted(fpl));
 4125         }
 4126 
 4127         cache_fpl_restore_partial(fpl);
 4128 #ifdef INVARIANTS
 4129         if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
 4130                 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
 4131                     cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
 4132         }
 4133 #endif
 4134 
 4135         ndp->ni_startdir = dvp;
 4136         cnp->cn_flags |= MAKEENTRY;
 4137         if (cache_fpl_islastcn(ndp))
 4138                 cnp->cn_flags |= ISLASTCN;
 4139         if (cache_fpl_isdotdot(cnp))
 4140                 cnp->cn_flags |= ISDOTDOT;
 4141 
 4142         /*
 4143          * Skip potential extra slashes parsing did not take care of.
 4144          * cache_fplookup_skip_slashes explains the mechanism.
 4145          */
 4146         if (__predict_false(*(cnp->cn_nameptr) == '/')) {
 4147                 do {
 4148                         cnp->cn_nameptr++;
 4149                         cache_fpl_pathlen_dec(fpl);
 4150                 } while (*(cnp->cn_nameptr) == '/');
 4151         }
 4152 
 4153         ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
 4154 #ifdef INVARIANTS
 4155         if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
 4156                 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
 4157                     __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
 4158                     cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
 4159         }
 4160 #endif
 4161         return (0);
 4162 }
 4163 
 4164 static int
 4165 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
 4166 {
 4167         struct componentname *cnp;
 4168         struct vnode *tvp;
 4169         seqc_t tvp_seqc;
 4170         int error, lkflags;
 4171 
 4172         cnp = fpl->cnp;
 4173         tvp = fpl->tvp;
 4174         tvp_seqc = fpl->tvp_seqc;
 4175 
 4176         if ((cnp->cn_flags & LOCKLEAF) != 0) {
 4177                 lkflags = LK_SHARED;
 4178                 if ((cnp->cn_flags & LOCKSHARED) == 0)
 4179                         lkflags = LK_EXCLUSIVE;
 4180                 error = vget_finish(tvp, lkflags, tvs);
 4181                 if (__predict_false(error != 0)) {
 4182                         return (cache_fpl_aborted(fpl));
 4183                 }
 4184         } else {
 4185                 vget_finish_ref(tvp, tvs);
 4186         }
 4187 
 4188         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
 4189                 if ((cnp->cn_flags & LOCKLEAF) != 0)
 4190                         vput(tvp);
 4191                 else
 4192                         vrele(tvp);
 4193                 return (cache_fpl_aborted(fpl));
 4194         }
 4195 
 4196         return (cache_fpl_handled(fpl));
 4197 }
 4198 
 4199 /*
 4200  * The caller may want to modify the state of the namecache.
 4201  */
 4202 static int __noinline
 4203 cache_fplookup_final_modifying(struct cache_fpl *fpl)
 4204 {
 4205         struct nameidata *ndp;
 4206         struct componentname *cnp;
 4207         enum vgetstate dvs;
 4208         struct vnode *dvp, *tvp;
 4209         struct mount *mp;
 4210         seqc_t dvp_seqc;
 4211         int error;
 4212         bool docache;
 4213 
 4214         ndp = fpl->ndp;
 4215         cnp = fpl->cnp;
 4216         dvp = fpl->dvp;
 4217         dvp_seqc = fpl->dvp_seqc;
 4218 
 4219         MPASS(*(cnp->cn_nameptr) != '/');
 4220         MPASS(cache_fpl_islastcn(ndp));
 4221         if ((cnp->cn_flags & LOCKPARENT) == 0)
 4222                 MPASS((cnp->cn_flags & WANTPARENT) != 0);
 4223         MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
 4224         MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
 4225             cnp->cn_nameiop == RENAME);
 4226         MPASS((cnp->cn_flags & MAKEENTRY) == 0);
 4227         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
 4228 
 4229         docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
 4230         if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
 4231                 docache = false;
 4232 
 4233         /*
 4234          * Regular lookup nullifies the slash, which we don't do here.
 4235          * Don't take chances with filesystem routines seeing it for
 4236          * the last entry.
 4237          */
 4238         if (cache_fpl_istrailingslash(fpl)) {
 4239                 return (cache_fpl_partial(fpl));
 4240         }
 4241 
 4242         mp = atomic_load_ptr(&dvp->v_mount);
 4243         if (__predict_false(mp == NULL)) {
 4244                 return (cache_fpl_aborted(fpl));
 4245         }
 4246 
 4247         if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
 4248                 cache_fpl_smr_exit(fpl);
 4249                 /*
 4250                  * The original code does not check for CREATE, which
 4251                  * might be a bug.  For now let the old lookup decide.
 4252                  */
 4253                 if (cnp->cn_nameiop == CREATE) {
 4254                         return (cache_fpl_aborted(fpl));
 4255                 }
 4256                 return (cache_fpl_handled_error(fpl, EROFS));
 4257         }
 4258 
 4259         if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
 4260                 cache_fpl_smr_exit(fpl);
 4261                 return (cache_fpl_handled_error(fpl, EEXIST));
 4262         }
 4263 
 4264         /*
 4265          * Secure access to dvp; check cache_fplookup_partial_setup for
 4266          * reasoning.
 4267          *
 4268          * XXX At least UFS requires its lookup routine to be called for
 4269          * the last path component, which leads to some level of complication
 4270          * and inefficiency:
 4271          * - the target routine always locks the target vnode, but our caller
 4272          *   may not need it locked
 4273          * - some of the VOP machinery asserts that the parent is locked, which
 4274          *   once more may be not required
 4275          *
 4276          * TODO: add a flag for filesystems which don't need this.
 4277          */
 4278         dvs = vget_prep_smr(dvp);
 4279         cache_fpl_smr_exit(fpl);
 4280         if (__predict_false(dvs == VGET_NONE)) {
 4281                 return (cache_fpl_aborted(fpl));
 4282         }
 4283 
 4284         vget_finish_ref(dvp, dvs);
 4285         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 4286                 vrele(dvp);
 4287                 return (cache_fpl_aborted(fpl));
 4288         }
 4289 
 4290         error = vn_lock(dvp, LK_EXCLUSIVE);
 4291         if (__predict_false(error != 0)) {
 4292                 vrele(dvp);
 4293                 return (cache_fpl_aborted(fpl));
 4294         }
 4295 
 4296         tvp = NULL;
 4297         cnp->cn_flags |= ISLASTCN;
 4298         if (docache)
 4299                 cnp->cn_flags |= MAKEENTRY;
 4300         if (cache_fpl_isdotdot(cnp))
 4301                 cnp->cn_flags |= ISDOTDOT;
 4302         cnp->cn_lkflags = LK_EXCLUSIVE;
 4303         error = VOP_LOOKUP(dvp, &tvp, cnp);
 4304         switch (error) {
 4305         case EJUSTRETURN:
 4306         case 0:
 4307                 break;
 4308         case ENOTDIR:
 4309         case ENOENT:
 4310                 vput(dvp);
 4311                 return (cache_fpl_handled_error(fpl, error));
 4312         default:
 4313                 vput(dvp);
 4314                 return (cache_fpl_aborted(fpl));
 4315         }
 4316 
 4317         fpl->tvp = tvp;
 4318         fpl->savename = (cnp->cn_flags & SAVENAME) != 0;
 4319 
 4320         if (tvp == NULL) {
 4321                 if ((cnp->cn_flags & SAVESTART) != 0) {
 4322                         ndp->ni_startdir = dvp;
 4323                         vrefact(ndp->ni_startdir);
 4324                         cnp->cn_flags |= SAVENAME;
 4325                         fpl->savename = true;
 4326                 }
 4327                 MPASS(error == EJUSTRETURN);
 4328                 if ((cnp->cn_flags & LOCKPARENT) == 0) {
 4329                         VOP_UNLOCK(dvp);
 4330                 }
 4331                 return (cache_fpl_handled(fpl));
 4332         }
 4333 
 4334         /*
 4335          * There are very hairy corner cases concerning various flag combinations
 4336          * and locking state.  In particular, here we only hold one lock instead of
 4337          * two.
 4338          *
 4339          * Skip the complexity as it is of no significance for normal workloads.
 4340          */
 4341         if (__predict_false(tvp == dvp)) {
 4342                 vput(dvp);
 4343                 vrele(tvp);
 4344                 return (cache_fpl_aborted(fpl));
 4345         }
 4346 
 4347         /*
 4348          * If they want the symlink itself we are fine, but if they want to
 4349          * follow it regular lookup has to be engaged.
 4350          */
 4351         if (tvp->v_type == VLNK) {
 4352                 if ((cnp->cn_flags & FOLLOW) != 0) {
 4353                         vput(dvp);
 4354                         vput(tvp);
 4355                         return (cache_fpl_aborted(fpl));
 4356                 }
 4357         }
 4358 
 4359         /*
 4360          * Since we expect this to be the terminal vnode it should almost never
 4361          * be a mount point.
 4362          */
 4363         if (__predict_false(cache_fplookup_is_mp(fpl))) {
 4364                 vput(dvp);
 4365                 vput(tvp);
 4366                 return (cache_fpl_aborted(fpl));
 4367         }
 4368 
 4369         if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
 4370                 vput(dvp);
 4371                 vput(tvp);
 4372                 return (cache_fpl_handled_error(fpl, EEXIST));
 4373         }
 4374 
 4375         if ((cnp->cn_flags & LOCKLEAF) == 0) {
 4376                 VOP_UNLOCK(tvp);
 4377         }
 4378 
 4379         if ((cnp->cn_flags & LOCKPARENT) == 0) {
 4380                 VOP_UNLOCK(dvp);
 4381         }
 4382 
 4383         if ((cnp->cn_flags & SAVESTART) != 0) {
 4384                 ndp->ni_startdir = dvp;
 4385                 vrefact(ndp->ni_startdir);
 4386                 cnp->cn_flags |= SAVENAME;
 4387                 fpl->savename = true;
 4388         }
 4389 
 4390         return (cache_fpl_handled(fpl));
 4391 }
 4392 
 4393 static int __noinline
 4394 cache_fplookup_modifying(struct cache_fpl *fpl)
 4395 {
 4396         struct nameidata *ndp;
 4397 
 4398         ndp = fpl->ndp;
 4399 
 4400         if (!cache_fpl_islastcn(ndp)) {
 4401                 return (cache_fpl_partial(fpl));
 4402         }
 4403         return (cache_fplookup_final_modifying(fpl));
 4404 }
 4405 
 4406 static int __noinline
 4407 cache_fplookup_final_withparent(struct cache_fpl *fpl)
 4408 {
 4409         struct componentname *cnp;
 4410         enum vgetstate dvs, tvs;
 4411         struct vnode *dvp, *tvp;
 4412         seqc_t dvp_seqc;
 4413         int error;
 4414 
 4415         cnp = fpl->cnp;
 4416         dvp = fpl->dvp;
 4417         dvp_seqc = fpl->dvp_seqc;
 4418         tvp = fpl->tvp;
 4419 
 4420         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
 4421 
 4422         /*
 4423          * This is less efficient than it can be for simplicity.
 4424          */
 4425         dvs = vget_prep_smr(dvp);
 4426         if (__predict_false(dvs == VGET_NONE)) {
 4427                 return (cache_fpl_aborted(fpl));
 4428         }
 4429         tvs = vget_prep_smr(tvp);
 4430         if (__predict_false(tvs == VGET_NONE)) {
 4431                 cache_fpl_smr_exit(fpl);
 4432                 vget_abort(dvp, dvs);
 4433                 return (cache_fpl_aborted(fpl));
 4434         }
 4435 
 4436         cache_fpl_smr_exit(fpl);
 4437 
 4438         if ((cnp->cn_flags & LOCKPARENT) != 0) {
 4439                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
 4440                 if (__predict_false(error != 0)) {
 4441                         vget_abort(tvp, tvs);
 4442                         return (cache_fpl_aborted(fpl));
 4443                 }
 4444         } else {
 4445                 vget_finish_ref(dvp, dvs);
 4446         }
 4447 
 4448         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 4449                 vget_abort(tvp, tvs);
 4450                 if ((cnp->cn_flags & LOCKPARENT) != 0)
 4451                         vput(dvp);
 4452                 else
 4453                         vrele(dvp);
 4454                 return (cache_fpl_aborted(fpl));
 4455         }
 4456 
 4457         error = cache_fplookup_final_child(fpl, tvs);
 4458         if (__predict_false(error != 0)) {
 4459                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
 4460                 if ((cnp->cn_flags & LOCKPARENT) != 0)
 4461                         vput(dvp);
 4462                 else
 4463                         vrele(dvp);
 4464                 return (error);
 4465         }
 4466 
 4467         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
 4468         return (0);
 4469 }
 4470 
 4471 static int
 4472 cache_fplookup_final(struct cache_fpl *fpl)
 4473 {
 4474         struct componentname *cnp;
 4475         enum vgetstate tvs;
 4476         struct vnode *dvp, *tvp;
 4477         seqc_t dvp_seqc;
 4478 
 4479         cnp = fpl->cnp;
 4480         dvp = fpl->dvp;
 4481         dvp_seqc = fpl->dvp_seqc;
 4482         tvp = fpl->tvp;
 4483 
 4484         MPASS(*(cnp->cn_nameptr) != '/');
 4485 
 4486         if (cnp->cn_nameiop != LOOKUP) {
 4487                 return (cache_fplookup_final_modifying(fpl));
 4488         }
 4489 
 4490         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
 4491                 return (cache_fplookup_final_withparent(fpl));
 4492 
 4493         tvs = vget_prep_smr(tvp);
 4494         if (__predict_false(tvs == VGET_NONE)) {
 4495                 return (cache_fpl_partial(fpl));
 4496         }
 4497 
 4498         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 4499                 cache_fpl_smr_exit(fpl);
 4500                 vget_abort(tvp, tvs);
 4501                 return (cache_fpl_aborted(fpl));
 4502         }
 4503 
 4504         cache_fpl_smr_exit(fpl);
 4505         return (cache_fplookup_final_child(fpl, tvs));
 4506 }
 4507 
 4508 /*
 4509  * Comment from locked lookup:
 4510  * Check for degenerate name (e.g. / or "") which is a way of talking about a
 4511  * directory, e.g. like "/." or ".".
 4512  */
 4513 static int __noinline
 4514 cache_fplookup_degenerate(struct cache_fpl *fpl)
 4515 {
 4516         struct componentname *cnp;
 4517         struct vnode *dvp;
 4518         enum vgetstate dvs;
 4519         int error, lkflags;
 4520 #ifdef INVARIANTS
 4521         char *cp;
 4522 #endif
 4523 
 4524         fpl->tvp = fpl->dvp;
 4525         fpl->tvp_seqc = fpl->dvp_seqc;
 4526 
 4527         cnp = fpl->cnp;
 4528         dvp = fpl->dvp;
 4529 
 4530 #ifdef INVARIANTS
 4531         for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) {
 4532                 KASSERT(*cp == '/',
 4533                     ("%s: encountered non-slash; string [%s]\n", __func__,
 4534                     cnp->cn_pnbuf));
 4535         }
 4536 #endif
 4537 
 4538         if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
 4539                 cache_fpl_smr_exit(fpl);
 4540                 return (cache_fpl_handled_error(fpl, EISDIR));
 4541         }
 4542 
 4543         MPASS((cnp->cn_flags & SAVESTART) == 0);
 4544 
 4545         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
 4546                 return (cache_fplookup_final_withparent(fpl));
 4547         }
 4548 
 4549         dvs = vget_prep_smr(dvp);
 4550         cache_fpl_smr_exit(fpl);
 4551         if (__predict_false(dvs == VGET_NONE)) {
 4552                 return (cache_fpl_aborted(fpl));
 4553         }
 4554 
 4555         if ((cnp->cn_flags & LOCKLEAF) != 0) {
 4556                 lkflags = LK_SHARED;
 4557                 if ((cnp->cn_flags & LOCKSHARED) == 0)
 4558                         lkflags = LK_EXCLUSIVE;
 4559                 error = vget_finish(dvp, lkflags, dvs);
 4560                 if (__predict_false(error != 0)) {
 4561                         return (cache_fpl_aborted(fpl));
 4562                 }
 4563         } else {
 4564                 vget_finish_ref(dvp, dvs);
 4565         }
 4566         return (cache_fpl_handled(fpl));
 4567 }
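
      /*
       * Example: open("/", O_RDONLY) (or a path like "///") resolves here:
       * the name consists solely of slashes, so the starting directory
       * itself is the result and only the LOCKLEAF/LOCKSHARED disposition
       * remains to be honored.
       */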
 4568 
 4569 static int __noinline
 4570 cache_fplookup_noentry(struct cache_fpl *fpl)
 4571 {
 4572         struct nameidata *ndp;
 4573         struct componentname *cnp;
 4574         enum vgetstate dvs;
 4575         struct vnode *dvp, *tvp;
 4576         seqc_t dvp_seqc;
 4577         int error;
 4578         bool docache;
 4579 
 4580         ndp = fpl->ndp;
 4581         cnp = fpl->cnp;
 4582         dvp = fpl->dvp;
 4583         dvp_seqc = fpl->dvp_seqc;
 4584 
 4585         MPASS((cnp->cn_flags & MAKEENTRY) == 0);
 4586         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
 4587         MPASS(!cache_fpl_isdotdot(cnp));
 4588 
 4589         /*
 4590          * Hack: delayed name len checking.
 4591          */
 4592         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
 4593                 cache_fpl_smr_exit(fpl);
 4594                 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
 4595         }
 4596 
 4597         if (cnp->cn_nameptr[0] == '/') {
 4598                 return (cache_fplookup_skip_slashes(fpl));
 4599         }
 4600 
 4601         if (cnp->cn_nameptr[0] == '\0') {
 4602                 if (fpl->tvp == NULL) {
 4603                         return (cache_fplookup_degenerate(fpl));
 4604                 }
 4605                 return (cache_fplookup_trailingslash(fpl));
 4606         }
 4607 
 4608         if (cnp->cn_nameiop != LOOKUP) {
 4609                 fpl->tvp = NULL;
 4610                 return (cache_fplookup_modifying(fpl));
 4611         }
 4612 
 4613         MPASS((cnp->cn_flags & SAVESTART) == 0);
 4614 
 4615         /*
 4616          * Only try to fill in the component if it is the last one,
 4617          * otherwise not only may there be several to handle, but the
 4618          * walk may be complicated.
 4619          */
 4620         if (!cache_fpl_islastcn(ndp)) {
 4621                 return (cache_fpl_partial(fpl));
 4622         }
 4623 
 4624         /*
 4625          * Regular lookup nullifies the slash, which we don't do here.
 4626          * Don't take chances with filesystem routines seeing it for
 4627          * the last entry.
 4628          */
 4629         if (cache_fpl_istrailingslash(fpl)) {
 4630                 return (cache_fpl_partial(fpl));
 4631         }
 4632 
 4633         /*
 4634          * Secure access to dvp; check cache_fplookup_partial_setup for
 4635          * reasoning.
 4636          */
 4637         dvs = vget_prep_smr(dvp);
 4638         cache_fpl_smr_exit(fpl);
 4639         if (__predict_false(dvs == VGET_NONE)) {
 4640                 return (cache_fpl_aborted(fpl));
 4641         }
 4642 
 4643         vget_finish_ref(dvp, dvs);
 4644         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 4645                 vrele(dvp);
 4646                 return (cache_fpl_aborted(fpl));
 4647         }
 4648 
 4649         error = vn_lock(dvp, LK_SHARED);
 4650         if (__predict_false(error != 0)) {
 4651                 vrele(dvp);
 4652                 return (cache_fpl_aborted(fpl));
 4653         }
 4654 
 4655         tvp = NULL;
 4656         /*
 4657          * TODO: provide variants which don't require locking either vnode.
 4658          */
 4659         cnp->cn_flags |= ISLASTCN;
 4660         docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
 4661         if (docache)
 4662                 cnp->cn_flags |= MAKEENTRY;
 4663         cnp->cn_lkflags = LK_SHARED;
 4664         if ((cnp->cn_flags & LOCKSHARED) == 0) {
 4665                 cnp->cn_lkflags = LK_EXCLUSIVE;
 4666         }
 4667         error = VOP_LOOKUP(dvp, &tvp, cnp);
 4668         switch (error) {
 4669         case EJUSTRETURN:
 4670         case 0:
 4671                 break;
 4672         case ENOTDIR:
 4673         case ENOENT:
 4674                 vput(dvp);
 4675                 return (cache_fpl_handled_error(fpl, error));
 4676         default:
 4677                 vput(dvp);
 4678                 return (cache_fpl_aborted(fpl));
 4679         }
 4680 
 4681         fpl->tvp = tvp;
 4682         if (!fpl->savename) {
 4683                 MPASS((cnp->cn_flags & SAVENAME) == 0);
 4684         }
 4685 
 4686         if (tvp == NULL) {
 4687                 MPASS(error == EJUSTRETURN);
 4688                 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
 4689                         vput(dvp);
 4690                 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
 4691                         VOP_UNLOCK(dvp);
 4692                 }
 4693                 return (cache_fpl_handled(fpl));
 4694         }
 4695 
 4696         if (tvp->v_type == VLNK) {
 4697                 if ((cnp->cn_flags & FOLLOW) != 0) {
 4698                         vput(dvp);
 4699                         vput(tvp);
 4700                         return (cache_fpl_aborted(fpl));
 4701                 }
 4702         }
 4703 
 4704         if (__predict_false(cache_fplookup_is_mp(fpl))) {
 4705                 vput(dvp);
 4706                 vput(tvp);
 4707                 return (cache_fpl_aborted(fpl));
 4708         }
 4709 
 4710         if ((cnp->cn_flags & LOCKLEAF) == 0) {
 4711                 VOP_UNLOCK(tvp);
 4712         }
 4713 
 4714         if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
 4715                 vput(dvp);
 4716         } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
 4717                 VOP_UNLOCK(dvp);
 4718         }
 4719         return (cache_fpl_handled(fpl));
 4720 }
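
      /*
       * The above is the bridge out of the lockless world on a cache miss
       * for the last component of a plain LOOKUP: dvp gets referenced and
       * locked shared, the seqc snapshot is revalidated and a regular
       * VOP_LOOKUP is issued with MAKEENTRY set (unless NOCACHE was
       * requested), so that the next lookup can be serviced from the cache.
       */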
 4721 
 4722 static int __noinline
 4723 cache_fplookup_dot(struct cache_fpl *fpl)
 4724 {
 4725         int error;
 4726 
 4727         MPASS(!seqc_in_modify(fpl->dvp_seqc));
 4728         /*
 4729          * Just re-assign the value. seqc will be checked later for the first
 4730          * non-dot path component in line and/or before deciding to return the
 4731          * vnode.
 4732          */
 4733         fpl->tvp = fpl->dvp;
 4734         fpl->tvp_seqc = fpl->dvp_seqc;
 4735 
 4736         counter_u64_add(dothits, 1);
 4737         SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
 4738 
 4739         error = 0;
 4740         if (cache_fplookup_is_mp(fpl)) {
 4741                 error = cache_fplookup_cross_mount(fpl);
 4742         }
 4743         return (error);
 4744 }
 4745 
 4746 static int __noinline
 4747 cache_fplookup_dotdot(struct cache_fpl *fpl)
 4748 {
 4749         struct nameidata *ndp;
 4750         struct componentname *cnp;
 4751         struct namecache *ncp;
 4752         struct vnode *dvp;
 4753         struct prison *pr;
 4754         u_char nc_flag;
 4755 
 4756         ndp = fpl->ndp;
 4757         cnp = fpl->cnp;
 4758         dvp = fpl->dvp;
 4759 
 4760         MPASS(cache_fpl_isdotdot(cnp));
 4761 
 4762         /*
 4763          * XXX this is racy the same way regular lookup is
 4764          */
 4765         for (pr = cnp->cn_cred->cr_prison; pr != NULL;
 4766             pr = pr->pr_parent)
 4767                 if (dvp == pr->pr_root)
 4768                         break;
 4769 
 4770         if (dvp == ndp->ni_rootdir ||
 4771             dvp == ndp->ni_topdir ||
 4772             dvp == rootvnode ||
 4773             pr != NULL) {
 4774                 fpl->tvp = dvp;
 4775                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
 4776                 if (seqc_in_modify(fpl->tvp_seqc)) {
 4777                         return (cache_fpl_aborted(fpl));
 4778                 }
 4779                 return (0);
 4780         }
 4781 
 4782         if ((dvp->v_vflag & VV_ROOT) != 0) {
 4783                 /*
 4784                  * TODO
 4785                  * The opposite of climb mount is needed here.
 4786                  */
 4787                 return (cache_fpl_partial(fpl));
 4788         }
 4789 
 4790         ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
 4791         if (ncp == NULL) {
 4792                 return (cache_fpl_aborted(fpl));
 4793         }
 4794 
 4795         nc_flag = atomic_load_char(&ncp->nc_flag);
 4796         if ((nc_flag & NCF_ISDOTDOT) != 0) {
 4797                 if ((nc_flag & NCF_NEGATIVE) != 0)
 4798                         return (cache_fpl_aborted(fpl));
 4799                 fpl->tvp = ncp->nc_vp;
 4800         } else {
 4801                 fpl->tvp = ncp->nc_dvp;
 4802         }
 4803 
 4804         fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
 4805         if (seqc_in_modify(fpl->tvp_seqc)) {
 4806                 return (cache_fpl_partial(fpl));
 4807         }
 4808 
 4809         /*
 4810          * Acquire fence provided by vn_seqc_read_any above.
 4811          */
 4812         if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
 4813                 return (cache_fpl_aborted(fpl));
 4814         }
 4815 
 4816         if (!cache_ncp_canuse(ncp)) {
 4817                 return (cache_fpl_aborted(fpl));
 4818         }
 4819 
 4820         counter_u64_add(dotdothits, 1);
 4821         return (0);
 4822 }
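
      /*
       * Note on v_cache_dd as used above: it points either to a
       * NCF_ISDOTDOT entry, in which case nc_vp is the parent, or to a
       * regular entry created when dvp itself was found in its parent, in
       * which case nc_dvp is the parent. The pointer is re-read after the
       * seqc fence to make sure the entry did not get replaced in the
       * meantime.
       */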
 4823 
 4824 static int __noinline
 4825 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
 4826 {
 4827         u_char nc_flag;
 4828         bool neg_promote;
 4829 
 4830         nc_flag = atomic_load_char(&ncp->nc_flag);
 4831         MPASS((nc_flag & NCF_NEGATIVE) != 0);
 4832         /*
 4833          * If they want to create an entry we need to replace this one.
 4834          */
 4835         if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
 4836                 fpl->tvp = NULL;
 4837                 return (cache_fplookup_modifying(fpl));
 4838         }
 4839         neg_promote = cache_neg_hit_prep(ncp);
 4840         if (!cache_fpl_neg_ncp_canuse(ncp)) {
 4841                 cache_neg_hit_abort(ncp);
 4842                 return (cache_fpl_partial(fpl));
 4843         }
 4844         if (neg_promote) {
 4845                 return (cache_fplookup_negative_promote(fpl, ncp, hash));
 4846         }
 4847         cache_neg_hit_finish(ncp);
 4848         cache_fpl_smr_exit(fpl);
 4849         return (cache_fpl_handled_error(fpl, ENOENT));
 4850 }
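
      /*
       * Negative entries found above may get promoted: cache_neg_hit_prep
       * reports whether the entry is hit often enough to warrant it, in
       * which case cache_fplookup_negative_promote moves it where it is
       * less likely to get evicted. The ENOENT return is only legal once
       * cache_fpl_neg_ncp_canuse confirms the entry is still valid.
       */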
 4851 
 4852 /*
 4853  * Resolve a symlink. Called by filesystem-specific routines.
 4854  *
 4855  * Code flow is:
 4856  * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
 4857  */
 4858 int
 4859 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
 4860 {
 4861         struct nameidata *ndp;
 4862         struct componentname *cnp;
 4863         size_t adjust;
 4864 
 4865         ndp = fpl->ndp;
 4866         cnp = fpl->cnp;
 4867 
 4868         if (__predict_false(len == 0)) {
 4869                 return (ENOENT);
 4870         }
 4871 
 4872         if (__predict_false(len > MAXPATHLEN - 2)) {
 4873                 if (cache_fpl_istrailingslash(fpl)) {
 4874                         return (EAGAIN);
 4875                 }
 4876         }
 4877 
 4878         ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
 4879 #ifdef INVARIANTS
 4880         if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
 4881                 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
 4882                     __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
 4883                     cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
 4884         }
 4885 #endif
 4886 
 4887         if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
 4888                 return (ENAMETOOLONG);
 4889         }
 4890 
 4891         if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
 4892                 return (ELOOP);
 4893         }
 4894 
 4895         adjust = len;
 4896         if (ndp->ni_pathlen > 1) {
 4897                 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
 4898         } else {
 4899                 if (cache_fpl_istrailingslash(fpl)) {
 4900                         adjust = len + 1;
 4901                         cnp->cn_pnbuf[len] = '/';
 4902                         cnp->cn_pnbuf[len + 1] = '\0';
 4903                 } else {
 4904                         cnp->cn_pnbuf[len] = '\0';
 4905                 }
 4906         }
 4907         bcopy(string, cnp->cn_pnbuf, len);
 4908 
 4909         ndp->ni_pathlen += adjust;
 4910         cache_fpl_pathlen_add(fpl, adjust);
 4911         cnp->cn_nameptr = cnp->cn_pnbuf;
 4912         fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
 4913         fpl->tvp = NULL;
 4914         return (0);
 4915 }
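
      /*
       * Worked example for the splicing above, assuming "b" in "a/b/c"
       * turns out to be a symlink to "x/y": on entry cn_pnbuf holds
       * "a/b/c" and ni_next points at "/c". The remaining suffix "/c"
       * (ni_pathlen == 3, counting the nul) is first copied out of the way
       * to cn_pnbuf + len, then the target overwrites the front, leaving
       * "x/y/c" to be parsed from scratch with cn_nameptr reset to the
       * beginning of the buffer.
       */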
 4916 
 4917 static int __noinline
 4918 cache_fplookup_symlink(struct cache_fpl *fpl)
 4919 {
 4920         struct mount *mp;
 4921         struct nameidata *ndp;
 4922         struct componentname *cnp;
 4923         struct vnode *dvp, *tvp;
 4924         int error;
 4925 
 4926         ndp = fpl->ndp;
 4927         cnp = fpl->cnp;
 4928         dvp = fpl->dvp;
 4929         tvp = fpl->tvp;
 4930 
 4931         if (cache_fpl_islastcn(ndp)) {
 4932                 if ((cnp->cn_flags & FOLLOW) == 0) {
 4933                         return (cache_fplookup_final(fpl));
 4934                 }
 4935         }
 4936 
 4937         mp = atomic_load_ptr(&dvp->v_mount);
 4938         if (__predict_false(mp == NULL)) {
 4939                 return (cache_fpl_aborted(fpl));
 4940         }
 4941 
 4942         /*
 4943          * Note this check races against setting the flag just like regular
 4944          * lookup.
 4945          */
 4946         if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
 4947                 cache_fpl_smr_exit(fpl);
 4948                 return (cache_fpl_handled_error(fpl, EACCES));
 4949         }
 4950 
 4951         error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
 4952         if (__predict_false(error != 0)) {
 4953                 switch (error) {
 4954                 case EAGAIN:
 4955                         return (cache_fpl_partial(fpl));
 4956                 case ENOENT:
 4957                 case ENAMETOOLONG:
 4958                 case ELOOP:
 4959                         cache_fpl_smr_exit(fpl);
 4960                         return (cache_fpl_handled_error(fpl, error));
 4961                 default:
 4962                         return (cache_fpl_aborted(fpl));
 4963                 }
 4964         }
 4965 
 4966         if (*(cnp->cn_nameptr) == '/') {
 4967                 fpl->dvp = cache_fpl_handle_root(fpl);
 4968                 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
 4969                 if (seqc_in_modify(fpl->dvp_seqc)) {
 4970                         return (cache_fpl_aborted(fpl));
 4971                 }
 4972         }
 4973         return (0);
 4974 }
 4975 
 4976 static int
 4977 cache_fplookup_next(struct cache_fpl *fpl)
 4978 {
 4979         struct componentname *cnp;
 4980         struct namecache *ncp;
 4981         struct vnode *dvp, *tvp;
 4982         u_char nc_flag;
 4983         uint32_t hash;
 4984         int error;
 4985 
 4986         cnp = fpl->cnp;
 4987         dvp = fpl->dvp;
 4988         hash = fpl->hash;
 4989 
 4990         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
 4991                 if (cnp->cn_namelen == 1) {
 4992                         return (cache_fplookup_dot(fpl));
 4993                 }
 4994                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 4995                         return (cache_fplookup_dotdot(fpl));
 4996                 }
 4997         }
 4998 
 4999         MPASS(!cache_fpl_isdotdot(cnp));
 5000 
 5001         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 5002                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 5003                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 5004                         break;
 5005         }
 5006 
 5007         if (__predict_false(ncp == NULL)) {
 5008                 return (cache_fplookup_noentry(fpl));
 5009         }
 5010 
 5011         tvp = atomic_load_ptr(&ncp->nc_vp);
 5012         nc_flag = atomic_load_char(&ncp->nc_flag);
 5013         if ((nc_flag & NCF_NEGATIVE) != 0) {
 5014                 return (cache_fplookup_neg(fpl, ncp, hash));
 5015         }
 5016 
 5017         if (!cache_ncp_canuse(ncp)) {
 5018                 return (cache_fpl_partial(fpl));
 5019         }
 5020 
 5021         fpl->tvp = tvp;
 5022         fpl->tvp_seqc = vn_seqc_read_any(tvp);
 5023         if (seqc_in_modify(fpl->tvp_seqc)) {
 5024                 return (cache_fpl_partial(fpl));
 5025         }
 5026 
 5027         counter_u64_add(numposhits, 1);
 5028         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
 5029 
 5030         error = 0;
 5031         if (cache_fplookup_is_mp(fpl)) {
 5032                 error = cache_fplookup_cross_mount(fpl);
 5033         }
 5034         return (error);
 5035 }
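
      /*
       * The above is the per-component core of the walk: "." and ".." take
       * dedicated paths, everything else is searched for in the hash chain
       * under SMR. A hit records tvp along with a sequence counter snapshot
       * to be validated later, then any mount point sitting on top of it is
       * crossed before returning.
       */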
 5036 
 5037 static bool
 5038 cache_fplookup_mp_supported(struct mount *mp)
 5039 {
 5040 
 5041         MPASS(mp != NULL);
 5042         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
 5043                 return (false);
 5044         return (true);
 5045 }
 5046 
 5047 /*
 5048  * Walk up the mount stack (if any).
 5049  *
 5050  * Correctness is provided in the following ways:
 5051  * - all vnodes are protected from freeing with SMR
 5052  * - struct mount objects are type stable, making them always safe to access
 5053  * - stability of the particular mount is provided by busying it
 5054  * - relationship between the vnode which is mounted on and the mount is
 5055  *   verified with the vnode sequence counter after busying
 5056  * - association between root vnode of the mount and the mount is protected
 5057  *   by busy
 5058  *
 5059  * From that point on we can read the sequence counter of the root vnode
 5060  * and get the next mount on the stack (if any) using the same protection.
 5061  *
 5062  * By the end of a successful walk we are guaranteed the reached state was
 5063  * indeed present at least at some point, matching the regular lookup guarantee.
 5064  */
 5065 static int __noinline
 5066 cache_fplookup_climb_mount(struct cache_fpl *fpl)
 5067 {
 5068         struct mount *mp, *prev_mp;
 5069         struct mount_pcpu *mpcpu, *prev_mpcpu;
 5070         struct vnode *vp;
 5071         seqc_t vp_seqc;
 5072 
 5073         vp = fpl->tvp;
 5074         vp_seqc = fpl->tvp_seqc;
 5075 
 5076         VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
 5077         mp = atomic_load_ptr(&vp->v_mountedhere);
 5078         if (__predict_false(mp == NULL)) {
 5079                 return (0);
 5080         }
 5081 
 5082         prev_mp = NULL;
 5083         for (;;) {
 5084                 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
 5085                         if (prev_mp != NULL)
 5086                                 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
 5087                         return (cache_fpl_partial(fpl));
 5088                 }
 5089                 if (prev_mp != NULL)
 5090                         vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
 5091                 if (!vn_seqc_consistent(vp, vp_seqc)) {
 5092                         vfs_op_thread_exit_crit(mp, mpcpu);
 5093                         return (cache_fpl_partial(fpl));
 5094                 }
 5095                 if (!cache_fplookup_mp_supported(mp)) {
 5096                         vfs_op_thread_exit_crit(mp, mpcpu);
 5097                         return (cache_fpl_partial(fpl));
 5098                 }
 5099                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
 5100                 if (vp == NULL) {
 5101                         vfs_op_thread_exit_crit(mp, mpcpu);
 5102                         return (cache_fpl_partial(fpl));
 5103                 }
 5104                 vp_seqc = vn_seqc_read_any(vp);
 5105                 if (seqc_in_modify(vp_seqc)) {
 5106                         vfs_op_thread_exit_crit(mp, mpcpu);
 5107                         return (cache_fpl_partial(fpl));
 5108                 }
 5109                 prev_mp = mp;
 5110                 prev_mpcpu = mpcpu;
 5111                 mp = atomic_load_ptr(&vp->v_mountedhere);
 5112                 if (mp == NULL)
 5113                         break;
 5114         }
 5115 
 5116         vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
 5117         fpl->tvp = vp;
 5118         fpl->tvp_seqc = vp_seqc;
 5119         return (0);
 5120 }
 5121 
 5122 static int __noinline
 5123 cache_fplookup_cross_mount(struct cache_fpl *fpl)
 5124 {
 5125         struct mount *mp;
 5126         struct mount_pcpu *mpcpu;
 5127         struct vnode *vp;
 5128         seqc_t vp_seqc;
 5129 
 5130         vp = fpl->tvp;
 5131         vp_seqc = fpl->tvp_seqc;
 5132 
 5133         VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
 5134         mp = atomic_load_ptr(&vp->v_mountedhere);
 5135         if (__predict_false(mp == NULL)) {
 5136                 return (0);
 5137         }
 5138 
 5139         if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
 5140                 return (cache_fpl_partial(fpl));
 5141         }
 5142         if (!vn_seqc_consistent(vp, vp_seqc)) {
 5143                 vfs_op_thread_exit_crit(mp, mpcpu);
 5144                 return (cache_fpl_partial(fpl));
 5145         }
 5146         if (!cache_fplookup_mp_supported(mp)) {
 5147                 vfs_op_thread_exit_crit(mp, mpcpu);
 5148                 return (cache_fpl_partial(fpl));
 5149         }
 5150         vp = atomic_load_ptr(&mp->mnt_rootvnode);
 5151         if (__predict_false(vp == NULL)) {
 5152                 vfs_op_thread_exit_crit(mp, mpcpu);
 5153                 return (cache_fpl_partial(fpl));
 5154         }
 5155         vp_seqc = vn_seqc_read_any(vp);
 5156         vfs_op_thread_exit_crit(mp, mpcpu);
 5157         if (seqc_in_modify(vp_seqc)) {
 5158                 return (cache_fpl_partial(fpl));
 5159         }
 5160         mp = atomic_load_ptr(&vp->v_mountedhere);
 5161         if (__predict_false(mp != NULL)) {
 5162                 /*
 5163                  * There are possibly more mount points on top.
 5164                  * Normally this does not happen, so for simplicity just start
 5165                  * over.
 5166                  */
 5167                 return (cache_fplookup_climb_mount(fpl));
 5168         }
 5169 
 5170         fpl->tvp = vp;
 5171         fpl->tvp_seqc = vp_seqc;
 5172         return (0);
 5173 }
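
      /*
       * cache_fplookup_cross_mount handles the common case of a single
       * mount sitting on the vnode. Mounts stacked on top of another
       * mount's root are rare, so rather than looping here the code simply
       * restarts in cache_fplookup_climb_mount, which can walk an
       * arbitrary stack.
       */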
 5174 
 5175 /*
 5176  * Check if a vnode is mounted on.
 5177  */
 5178 static bool
 5179 cache_fplookup_is_mp(struct cache_fpl *fpl)
 5180 {
 5181         struct vnode *vp;
 5182 
 5183         vp = fpl->tvp;
 5184         return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0);
 5185 }
 5186 
 5187 /*
 5188  * Parse the path.
 5189  *
 5190  * The code was originally copy-pasted from regular lookup and despite
 5191  * clean ups leaves performance on the table. Any modifications here
 5192  * must take into account that in case off fallback the resulting
 5193  * nameidata state has to be compatible with the original.
 5194  */
 5195 
 5196 /*
 5197  * Debug ni_pathlen tracking.
 5198  */
 5199 #ifdef INVARIANTS
 5200 static void
 5201 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
 5202 {
 5203 
 5204         fpl->debug.ni_pathlen += n;
 5205         KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
 5206             ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
 5207 }
 5208 
 5209 static void
 5210 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
 5211 {
 5212 
 5213         fpl->debug.ni_pathlen -= n;
 5214         KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
 5215             ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
 5216 }
 5217 
 5218 static void
 5219 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
 5220 {
 5221 
 5222         cache_fpl_pathlen_add(fpl, 1);
 5223 }
 5224 
 5225 static void
 5226 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
 5227 {
 5228 
 5229         cache_fpl_pathlen_sub(fpl, 1);
 5230 }
 5231 #else
 5232 static void
 5233 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
 5234 {
 5235 }
 5236 
 5237 static void
 5238 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
 5239 {
 5240 }
 5241 
 5242 static void
 5243 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
 5244 {
 5245 }
 5246 
 5247 static void
 5248 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
 5249 {
 5250 }
 5251 #endif
 5252 
 5253 static void
 5254 cache_fplookup_parse(struct cache_fpl *fpl)
 5255 {
 5256         struct nameidata *ndp;
 5257         struct componentname *cnp;
 5258         struct vnode *dvp;
 5259         char *cp;
 5260         uint32_t hash;
 5261 
 5262         ndp = fpl->ndp;
 5263         cnp = fpl->cnp;
 5264         dvp = fpl->dvp;
 5265 
 5266         /*
 5267          * Find the end of this path component, it is either / or nul.
 5268          *
 5269          * Store / as a temporary sentinel so that we only have one character
 5270          * to test for. Pathnames tend to be short so this should not
 5271          * result in cache misses.
 5272          *
 5273          * TODO: fix this to be word-sized.
 5274          */
 5275         KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
 5276             ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
 5277             __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
 5278             fpl->nulchar, cnp->cn_pnbuf));
 5279         KASSERT(*fpl->nulchar == '\0',
 5280             ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
 5281             cnp->cn_pnbuf));
 5282         hash = cache_get_hash_iter_start(dvp);
 5283         *fpl->nulchar = '/';
 5284         for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
 5285                 KASSERT(*cp != '\0',
 5286                     ("%s: encountered unexpected nul; string [%s]\n", __func__,
 5287                     cnp->cn_nameptr));
 5288                 hash = cache_get_hash_iter(*cp, hash);
 5289                 continue;
 5290         }
 5291         *fpl->nulchar = '\0';
 5292         fpl->hash = cache_get_hash_iter_finish(hash);
 5293 
 5294         cnp->cn_namelen = cp - cnp->cn_nameptr;
 5295         cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
 5296 
 5297 #ifdef INVARIANTS
 5298         if (cnp->cn_namelen <= NAME_MAX) {
 5299                 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
 5300                         panic("%s: mismatched hash for [%s] len %ld", __func__,
 5301                             cnp->cn_nameptr, cnp->cn_namelen);
 5302                 }
 5303         }
 5304 #endif
 5305 
 5306         /*
 5307          * Hack: we have to check if the found path component's length exceeds
 5308          * NAME_MAX. However, the condition is very rarely true and the check
 5309          * can be elided in the common case -- if an entry was found in the cache,
 5310          * then it could not have been too long to begin with.
 5311          */
 5312         ndp->ni_next = cp;
 5313 }
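
      /*
       * Illustration of the sentinel trick above: when parsing the last
       * component of "foo/bar", the terminating nul is temporarily
       * replaced with '/', so a single comparison against '/' ends the
       * scan whether the component is terminated by a genuine separator or
       * by the end of the string. The hash is accumulated in the same pass
       * and the nul is restored afterwards.
       */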
 5314 
 5315 static void
 5316 cache_fplookup_parse_advance(struct cache_fpl *fpl)
 5317 {
 5318         struct nameidata *ndp;
 5319         struct componentname *cnp;
 5320 
 5321         ndp = fpl->ndp;
 5322         cnp = fpl->cnp;
 5323 
 5324         cnp->cn_nameptr = ndp->ni_next;
 5325         KASSERT(*(cnp->cn_nameptr) == '/',
 5326             ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
 5327             cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
 5328         cnp->cn_nameptr++;
 5329         cache_fpl_pathlen_dec(fpl);
 5330 }
 5331 
 5332 /*
 5333  * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
 5334  *
 5335  * Lockless lookup tries to elide checking for spurious slashes and, should
 5336  * they be present, is guaranteed to fail to find an entry. In this case the
 5337  * caller must check if the name starts with a slash and call this routine,
 5338  * which fast-forwards across the spurious slashes and sets the state up for
 5339  * retry.
 5340  */
 5341 static int __noinline
 5342 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
 5343 {
 5344         struct nameidata *ndp;
 5345         struct componentname *cnp;
 5346 
 5347         ndp = fpl->ndp;
 5348         cnp = fpl->cnp;
 5349 
 5350         MPASS(*(cnp->cn_nameptr) == '/');
 5351         do {
 5352                 cnp->cn_nameptr++;
 5353                 cache_fpl_pathlen_dec(fpl);
 5354         } while (*(cnp->cn_nameptr) == '/');
 5355 
 5356         /*
 5357          * Go back to one slash so that cache_fplookup_parse_advance has
 5358          * something to skip.
 5359          */
 5360         cnp->cn_nameptr--;
 5361         cache_fpl_pathlen_inc(fpl);
 5362 
 5363         /*
 5364          * cache_fplookup_parse_advance starts from ndp->ni_next
 5365          */
 5366         ndp->ni_next = cnp->cn_nameptr;
 5367 
 5368         /*
 5369          * See cache_fplookup_dot.
 5370          */
 5371         fpl->tvp = fpl->dvp;
 5372         fpl->tvp_seqc = fpl->dvp_seqc;
 5373 
 5374         return (0);
 5375 }
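
      /*
       * Example: "foo///bar". After "foo" resolves, the parser yields an
       * empty component which is guaranteed to miss in the cache. The miss
       * path (cache_fplookup_noentry) notices the leading '/' and calls
       * this routine, which consumes the extra slashes so that the walk
       * restarts at "bar".
       */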
 5376 
 5377 /*
 5378  * Handle trailing slashes (e.g., "foo/").
 5379  *
 5380  * If a trailing slash is found the terminal vnode must be a directory.
 5381  * Regular lookup shortens the path by nullifying the first trailing slash and
 5382  * sets the TRAILINGSLASH flag to denote this took place. There are several
 5383  * checks on it performed later.
 5384  *
 5385  * Similarly to spurious slashes, lockless lookup handles this in a speculative
 5386  * manner relying on an invariant that a non-directory vnode will get a miss.
 5387  * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
 5388  *
 5389  * Thus for a path like "foo/bar/" the code unwinds the state back to 'bar/'
 5390  * and denotes this is the last path component, which avoids looping back.
 5391  *
 5392  * Only plain lookups are supported for now, which limits the corner cases to handle.
 5393  */
 5394 static int __noinline
 5395 cache_fplookup_trailingslash(struct cache_fpl *fpl)
 5396 {
 5397 #ifdef INVARIANTS
 5398         size_t ni_pathlen;
 5399 #endif
 5400         struct nameidata *ndp;
 5401         struct componentname *cnp;
 5402         struct namecache *ncp;
 5403         struct vnode *tvp;
 5404         char *cn_nameptr_orig, *cn_nameptr_slash;
 5405         seqc_t tvp_seqc;
 5406         u_char nc_flag;
 5407 
 5408         ndp = fpl->ndp;
 5409         cnp = fpl->cnp;
 5410         tvp = fpl->tvp;
 5411         tvp_seqc = fpl->tvp_seqc;
 5412 
 5413         MPASS(fpl->dvp == fpl->tvp);
 5414         KASSERT(cache_fpl_istrailingslash(fpl),
 5415             ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
 5416             cnp->cn_pnbuf));
 5417         KASSERT(cnp->cn_nameptr[0] == '\0',
 5418             ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
 5419             cnp->cn_pnbuf));
 5420         KASSERT(cnp->cn_namelen == 0,
 5421             ("%s: expected namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen,
 5422             cnp->cn_pnbuf));
 5423         MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);
 5424 
 5425         if (cnp->cn_nameiop != LOOKUP) {
 5426                 return (cache_fpl_aborted(fpl));
 5427         }
 5428 
 5429         if (__predict_false(tvp->v_type != VDIR)) {
 5430                 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
 5431                         return (cache_fpl_aborted(fpl));
 5432                 }
 5433                 cache_fpl_smr_exit(fpl);
 5434                 return (cache_fpl_handled_error(fpl, ENOTDIR));
 5435         }
 5436 
 5437         /*
 5438          * Denote the last component.
 5439          */
 5440         ndp->ni_next = &cnp->cn_nameptr[0];
 5441         MPASS(cache_fpl_islastcn(ndp));
 5442 
 5443         /*
 5444          * Unwind trailing slashes.
 5445          */
 5446         cn_nameptr_orig = cnp->cn_nameptr;
 5447         while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
 5448                 cnp->cn_nameptr--;
 5449                 if (cnp->cn_nameptr[0] != '/') {
 5450                         break;
 5451                 }
 5452         }
 5453 
 5454         /*
 5455          * Unwind to the beginning of the path component.
 5456          *
 5457          * Note the path may or may not have started with a slash.
 5458          */
 5459         cn_nameptr_slash = cnp->cn_nameptr;
 5460         while (cnp->cn_nameptr > cnp->cn_pnbuf) {
 5461                 cnp->cn_nameptr--;
 5462                 if (cnp->cn_nameptr[0] == '/') {
 5463                         break;
 5464                 }
 5465         }
 5466         if (cnp->cn_nameptr[0] == '/') {
 5467                 cnp->cn_nameptr++;
 5468         }
 5469 
 5470         cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
 5471         cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
 5472         cache_fpl_checkpoint(fpl);
 5473 
 5474 #ifdef INVARIANTS
 5475         ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
 5476         if (ni_pathlen != fpl->debug.ni_pathlen) {
 5477                 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
 5478                     __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
 5479                     cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
 5480         }
 5481 #endif
 5482 
 5483         /*
 5484          * The previous directory is this one.
 5485          */
 5486         if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
 5487                 return (0);
 5488         }
 5489 
 5490         /*
 5491          * The previous directory is something else.
 5492          */
 5493         tvp = fpl->tvp;
 5494         ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
 5495         if (__predict_false(ncp == NULL)) {
 5496                 return (cache_fpl_aborted(fpl));
 5497         }
 5498         nc_flag = atomic_load_char(&ncp->nc_flag);
 5499         if ((nc_flag & NCF_ISDOTDOT) != 0) {
 5500                 return (cache_fpl_aborted(fpl));
 5501         }
 5502         fpl->dvp = ncp->nc_dvp;
 5503         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
 5504         if (seqc_in_modify(fpl->dvp_seqc)) {
 5505                 return (cache_fpl_aborted(fpl));
 5506         }
 5507         return (0);
 5508 }
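
      /*
       * Example: "foo/bar/". The empty component after the trailing slash
       * misses in the cache with fpl->tvp still pointing at "bar", which
       * the code above verifies is a directory. The state is then rewound
       * so that "bar" gets re-resolved as the last component, with the
       * parent recovered from v_cache_dd.
       */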
 5509 
 5510 /*
 5511  * See the API contract for VOP_FPLOOKUP_VEXEC.
 5512  */
 5513 static int __noinline
 5514 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
 5515 {
 5516         struct componentname *cnp;
 5517         struct vnode *dvp;
 5518         seqc_t dvp_seqc;
 5519 
 5520         cnp = fpl->cnp;
 5521         dvp = fpl->dvp;
 5522         dvp_seqc = fpl->dvp_seqc;
 5523 
 5524         /*
 5525          * TODO: Due to ignoring slashes, lookup performs a permission check
 5526          * on the last dir when it should not. If it fails, we get here. It
 5527          * is possible to fix this up fully without resorting to regular
 5528          * lookup, but for now just abort.
 5529          */
 5530         if (cache_fpl_istrailingslash(fpl)) {
 5531                 return (cache_fpl_aborted(fpl));
 5532         }
 5533 
 5534         /*
 5535          * Hack: delayed degenerate path checking.
 5536          */
 5537         if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
 5538                 return (cache_fplookup_degenerate(fpl));
 5539         }
 5540 
 5541         /*
 5542          * Hack: delayed name len checking.
 5543          */
 5544         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
 5545                 cache_fpl_smr_exit(fpl);
 5546                 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
 5547         }
 5548 
 5549         /*
 5550          * Hack: they may be looking up foo/bar, where foo is not a directory.
 5551          * In such a case we need to return ENOTDIR, but we may happen to get
 5552          * here with a different error.
 5553          */
 5554         if (dvp->v_type != VDIR) {
 5555                 error = ENOTDIR;
 5556         }
 5557 
 5558         /*
 5559          * Hack: handle O_SEARCH.
 5560          *
 5561          * Open Group Base Specifications Issue 7, 2018 edition states:
 5562          * <quote>
 5563          * If the access mode of the open file description associated with the
 5564          * file descriptor is not O_SEARCH, the function shall check whether
 5565          * directory searches are permitted using the current permissions of
 5566          * the directory underlying the file descriptor. If the access mode is
 5567          * O_SEARCH, the function shall not perform the check.
 5568          * </quote>
 5569          *
 5570          * Regular lookup tests for the NOEXECCHECK flag for every path
 5571          * component to decide whether to do the permission check. However,
 5572          * since most lookups never have the flag (and when they do it is only
 5573          * present for the first path component), lockless lookup only acts on
 5574          * it if there is a permission problem. Here the flag is represented
 5575          * with a boolean so that we don't have to clear it on the way out.
 5576          *
 5577          * For simplicity this always aborts.
 5578          * TODO: check if this is the first lookup and ignore the permission
 5579          * problem. Note the flag has to survive fallback (if it happens to be
 5580          * performed).
 5581          */
 5582         if (fpl->fsearch) {
 5583                 return (cache_fpl_aborted(fpl));
 5584         }
 5585 
 5586         switch (error) {
 5587         case EAGAIN:
 5588                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 5589                         error = cache_fpl_aborted(fpl);
 5590                 } else {
 5591                         cache_fpl_partial(fpl);
 5592                 }
 5593                 break;
 5594         default:
 5595                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 5596                         error = cache_fpl_aborted(fpl);
 5597                 } else {
 5598                         cache_fpl_smr_exit(fpl);
 5599                         cache_fpl_handled_error(fpl, error);
 5600                 }
 5601                 break;
 5602         }
 5603         return (error);
 5604 }
 5605 
 5606 static int
 5607 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
 5608 {
 5609         struct nameidata *ndp;
 5610         struct componentname *cnp;
 5611         struct mount *mp;
 5612         int error;
 5613 
 5614         ndp = fpl->ndp;
 5615         cnp = fpl->cnp;
 5616 
 5617         cache_fpl_checkpoint(fpl);
 5618 
 5619         /*
 5620          * The vnode at hand is almost always stable, so skip checking for it.
 5621          * Worst case, this postpones the check towards the end of the main
 5622          * loop iteration.
 5623          */
 5624         fpl->dvp = dvp;
 5625         fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);
 5626 
 5627         mp = atomic_load_ptr(&dvp->v_mount);
 5628         if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
 5629                 return (cache_fpl_aborted(fpl));
 5630         }
 5631 
 5632         MPASS(fpl->tvp == NULL);
 5633 
 5634         for (;;) {
 5635                 cache_fplookup_parse(fpl);
 5636 
 5637                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
 5638                 if (__predict_false(error != 0)) {
 5639                         error = cache_fplookup_failed_vexec(fpl, error);
 5640                         break;
 5641                 }
 5642 
 5643                 error = cache_fplookup_next(fpl);
 5644                 if (__predict_false(cache_fpl_terminated(fpl))) {
 5645                         break;
 5646                 }
 5647 
 5648                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
 5649 
 5650                 if (fpl->tvp->v_type == VLNK) {
 5651                         error = cache_fplookup_symlink(fpl);
 5652                         if (cache_fpl_terminated(fpl)) {
 5653                                 break;
 5654                         }
 5655                 } else {
 5656                         if (cache_fpl_islastcn(ndp)) {
 5657                                 error = cache_fplookup_final(fpl);
 5658                                 break;
 5659                         }
 5660 
 5661                         if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
 5662                                 error = cache_fpl_aborted(fpl);
 5663                                 break;
 5664                         }
 5665 
 5666                         fpl->dvp = fpl->tvp;
 5667                         fpl->dvp_seqc = fpl->tvp_seqc;
 5668                         cache_fplookup_parse_advance(fpl);
 5669                 }
 5670 
 5671                 cache_fpl_checkpoint(fpl);
 5672         }
 5673 
 5674         return (error);
 5675 }
 5676 
 5677 /*
 5678  * Fast path lookup protected with SMR and sequence counters.
 5679  *
 5680  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 5681  *
 5682  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
 5683  * criteria outlined below.
 5684  *
 5685  * Traditional vnode lookup conceptually looks like this:
 5686  *
 5687  * vn_lock(current);
 5688  * for (;;) {
 5689  *      next = find();
 5690  *      vn_lock(next);
 5691  *      vn_unlock(current);
 5692  *      current = next;
 5693  *      if (last)
 5694  *          break;
 5695  * }
 5696  * return (current);
 5697  *
 5698  * Each jump to the next vnode is safe memory-wise and atomic with respect to
 5699  * any modifications thanks to holding respective locks.
 5700  *
 5701  * The same guarantee can be provided with a combination of safe memory
 5702  * reclamation and sequence counters instead. If all operations which affect
 5703  * the relationship between the current vnode and the one we are looking for
 5704  * also modify the counter, we can verify whether all the conditions held as
 5705  * we made the jump. This includes things like permissions, mount points etc.
 5706  * Counter modification is provided by enclosing relevant places in
 5707  * vn_seqc_write_begin()/end() calls.
 5708  *
 5709  * Thus this translates to:
 5710  *
 5711  * vfs_smr_enter();
 5712  * dvp_seqc = seqc_read_any(dvp);
 5713  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 5714  *     abort();
 5715  * for (;;) {
 5716  *      tvp = find();
 5717  *      tvp_seqc = seqc_read_any(tvp);
 5718  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 5719  *          abort();
 5720  *      if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 5721  *          abort();
 5722  *      dvp = tvp; // we know nothing of importance has changed
 5723  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 5724  *      if (last)
 5725  *          break;
 5726  * }
 5727  * vget(); // secure the vnode
 5728  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
 5729  *          abort();
 5730  * // at this point we know nothing has changed for any parent<->child pair
 5731  * // as they were crossed during the lookup, meaning we matched the guarantee
 5732  * // of the locked variant
 5733  * return (tvp);
 5734  *
 5735  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 5736  * - they are called while within vfs_smr protection which they must never exit
 5737  * - EAGAIN can be returned to denote checking could not be performed; it is
 5738  *   always valid to return it
 5739  * - if the sequence counter has not changed the result must be valid
 5740  * - if the sequence counter has changed both false positives and false negatives
 5741  *   are permitted (since the result will be rejected later)
 5742  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 5743  *
 5744  * Caveats to watch out for:
 5745  * - vnodes are passed unlocked and unreferenced with nothing stopping
 5746  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 5747  *   to use atomic_load_ptr to fetch it.
 5748  * - the aforementioned object can also get freed, meaning absent other means it
 5749  *   should be protected with vfs_smr
 5750  * - either safely checking permissions as they are modified or guaranteeing
 5751  *   their stability is left to the routine
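       *
       * As a reference point, a minimal implementation for a filesystem
       * keeping permission data directly in its node could look like the
       * sketch below (a hypothetical "foofs"; the field names are made up,
       * but the shape mirrors the simple in-tree users of
       * vaccess_vexec_smr):
       *
       *      static int
       *      foofs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
       *      {
       *              struct vnode *vp = v->a_vp;
       *              struct foofs_node *node = atomic_load_ptr(&vp->v_data);
       *
       *              // ->v_data may be zapped by VOP_RECLAIM, see above
       *              if (__predict_false(node == NULL))
       *                      return (EAGAIN);
       *              return (vaccess_vexec_smr(node->fn_mode, node->fn_uid,
       *                  node->fn_gid, v->a_cred));
       *      }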
 5752  */
 5753 int
 5754 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
 5755     struct pwd **pwdp)
 5756 {
 5757         struct cache_fpl fpl;
 5758         struct pwd *pwd;
 5759         struct vnode *dvp;
 5760         struct componentname *cnp;
 5761         int error;
 5762 
 5763         fpl.status = CACHE_FPL_STATUS_UNSET;
 5764         fpl.in_smr = false;
 5765         fpl.ndp = ndp;
 5766         fpl.cnp = cnp = &ndp->ni_cnd;
 5767         MPASS(ndp->ni_lcf == 0);
 5768         MPASS(curthread == cnp->cn_thread);
 5769         KASSERT((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
 5770             ("%s: internal flags found in cn_flags %" PRIx64, __func__,
 5771             cnp->cn_flags));
 5772         if ((cnp->cn_flags & SAVESTART) != 0) {
 5773                 MPASS(cnp->cn_nameiop != LOOKUP);
 5774         }
 5775         MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
 5776 
 5777         if (__predict_false(!cache_can_fplookup(&fpl))) {
 5778                 *status = fpl.status;
 5779                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
 5780                 return (EOPNOTSUPP);
 5781         }
 5782 
 5783         cache_fpl_checkpoint_outer(&fpl);
 5784 
 5785         cache_fpl_smr_enter_initial(&fpl);
 5786 #ifdef INVARIANTS
 5787         fpl.debug.ni_pathlen = ndp->ni_pathlen;
 5788 #endif
 5789         fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
 5790         fpl.fsearch = false;
 5791         fpl.savename = (cnp->cn_flags & SAVENAME) != 0;
 5792         fpl.tvp = NULL; /* for degenerate path handling */
 5793         fpl.pwd = pwdp;
 5794         pwd = pwd_get_smr();
 5795         *(fpl.pwd) = pwd;
 5796         ndp->ni_rootdir = pwd->pwd_rdir;
 5797         ndp->ni_topdir = pwd->pwd_jdir;
 5798 
 5799         if (cnp->cn_pnbuf[0] == '/') {
 5800                 dvp = cache_fpl_handle_root(&fpl);
 5801                 MPASS(ndp->ni_resflags == 0);
 5802                 ndp->ni_resflags = NIRES_ABS;
 5803         } else {
 5804                 if (ndp->ni_dirfd == AT_FDCWD) {
 5805                         dvp = pwd->pwd_cdir;
 5806                 } else {
 5807                         error = cache_fplookup_dirfd(&fpl, &dvp);
 5808                         if (__predict_false(error != 0)) {
 5809                                 goto out;
 5810                         }
 5811                 }
 5812         }
 5813 
 5814         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
 5815         error = cache_fplookup_impl(dvp, &fpl);
 5816 out:
 5817         cache_fpl_smr_assert_not_entered(&fpl);
 5818         cache_fpl_assert_status(&fpl);
 5819         *status = fpl.status;
 5820         if (SDT_PROBES_ENABLED()) {
 5821                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
 5822                 if (fpl.status == CACHE_FPL_STATUS_HANDLED)
 5823                         SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
 5824                             ndp);
 5825         }
 5826 
 5827         if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
 5828                 MPASS(error != CACHE_FPL_FAILED);
 5829                 if (error != 0) {
 5830                         MPASS(fpl.dvp == NULL);
 5831                         MPASS(fpl.tvp == NULL);
 5832                         MPASS(fpl.savename == false);
 5833                 }
 5834                 ndp->ni_dvp = fpl.dvp;
 5835                 ndp->ni_vp = fpl.tvp;
 5836                 if (fpl.savename) {
 5837                         cnp->cn_flags |= HASBUF;
 5838                 } else {
 5839                         cache_fpl_cleanup_cnp(cnp);
 5840                 }
 5841         }
 5842         return (error);
 5843 }
