FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_cache.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-3-Clause
    3  *
    4  * Copyright (c) 1989, 1993, 1995
    5  *      The Regents of the University of California.  All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * Poul-Henning Kamp of the FreeBSD Project.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
   35  */
   36 
   37 #include <sys/cdefs.h>
   38 __FBSDID("$FreeBSD$");
   39 
   40 #include "opt_ddb.h"
   41 #include "opt_ktrace.h"
   42 
   43 #include <sys/param.h>
   44 #include <sys/systm.h>
   45 #include <sys/capsicum.h>
   46 #include <sys/counter.h>
   47 #include <sys/filedesc.h>
   48 #include <sys/fnv_hash.h>
   49 #include <sys/kernel.h>
   50 #include <sys/ktr.h>
   51 #include <sys/lock.h>
   52 #include <sys/malloc.h>
   53 #include <sys/fcntl.h>
   54 #include <sys/jail.h>
   55 #include <sys/mount.h>
   56 #include <sys/namei.h>
   57 #include <sys/proc.h>
   58 #include <sys/seqc.h>
   59 #include <sys/sdt.h>
   60 #include <sys/smr.h>
   61 #include <sys/smp.h>
   62 #include <sys/syscallsubr.h>
   63 #include <sys/sysctl.h>
   64 #include <sys/sysproto.h>
   65 #include <sys/vnode.h>
   66 #include <ck_queue.h>
   67 #ifdef KTRACE
   68 #include <sys/ktrace.h>
   69 #endif
   70 #ifdef INVARIANTS
   71 #include <machine/_inttypes.h>
   72 #endif
   73 
   74 #include <sys/capsicum.h>
   75 
   76 #include <security/audit/audit.h>
   77 #include <security/mac/mac_framework.h>
   78 
   79 #ifdef DDB
   80 #include <ddb/ddb.h>
   81 #endif
   82 
   83 #include <vm/uma.h>
   84 
   85 /*
   86  * High level overview of name caching in the VFS layer.
   87  *
   88  * Originally caching was implemented as part of UFS, later extracted to allow
   89  * use by other filesystems. A decision was made to make it optional and
   90  * completely detached from the rest of the kernel, which comes with limitations
   91  * outlined near the end of this comment block.
   92  *
   93  * This fundamental choice needs to be revisited. In the meantime, the current
    94  * state is described below. The significance of all notable routines is explained
    95  * in comments placed above their implementation. Scattered throughout the
   96  * file are TODO comments indicating shortcomings which can be fixed without
   97  * reworking everything (most of the fixes will likely be reusable). Various
    98  * details are omitted from this explanation to avoid cluttering the overview; they
   99  * have to be checked by reading the code and associated commentary.
  100  *
  101  * Keep in mind that it's individual path components which are cached, not full
  102  * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
  103  * one for each name.
  104  *
  105  * I. Data organization
  106  *
  107  * Entries are described by "struct namecache" objects and stored in a hash
  108  * table. See cache_get_hash for more information.
  109  *
  110  * "struct vnode" contains pointers to source entries (names which can be found
  111  * when traversing through said vnode), destination entries (names of that
   112  * vnode; see "Limitations" for a breakdown on the subject) and a pointer to
  113  * the parent vnode.
  114  *
  115  * The (directory vnode; name) tuple reliably determines the target entry if
  116  * it exists.
  117  *
  118  * Since there are no small locks at this time (all are 32 bytes in size on
  119  * LP64), the code works around the problem by introducing lock arrays to
  120  * protect hash buckets and vnode lists.
  121  *
  122  * II. Filesystem integration
  123  *
  124  * Filesystems participating in name caching do the following:
  125  * - set vop_lookup routine to vfs_cache_lookup
  126  * - set vop_cachedlookup to whatever can perform the lookup if the above fails
  127  * - if they support lockless lookup (see below), vop_fplookup_vexec and
  128  *   vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
  129  *   mount point
  130  * - call cache_purge or cache_vop_* routines to eliminate stale entries as
  131  *   applicable
  132  * - call cache_enter to add entries depending on the MAKEENTRY flag
  133  *
  134  * With the above in mind, there are 2 entry points when doing lookups:
  135  * - ... -> namei -> cache_fplookup -- this is the default
  136  * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
  137  *   should the above fail
  138  *
   139  * Example code flow of how an entry is added:
  140  * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
  141  * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
  142  *
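       * A minimal sketch of the filesystem side of the above; the names
       * prefixed with "examplefs_" are hypothetical:
       *
       *      static struct vop_vector examplefs_vnodeops = {
       *              .vop_default            = &default_vnodeops,
       *              .vop_lookup             = vfs_cache_lookup,
       *              .vop_cachedlookup       = examplefs_lookup,
       *              .vop_fplookup_vexec     = examplefs_fplookup_vexec,
       *              .vop_fplookup_symlink   = examplefs_fplookup_symlink,
       *      };
       *
       * Here examplefs_lookup calls cache_enter for entries it wants cached
       * (honoring MAKEENTRY) and the mount code sets MNTK_FPLOOKUP only once
       * the fplookup routines are in place.
       *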
  143  * III. Performance considerations
  144  *
   145  * For the lockless case, forward lookup avoids any writes to shared areas
   146  * apart from the terminal path component. In other words, non-modifying lookups of
  147  * different files don't suffer any scalability problems in the namecache.
  148  * Looking up the same file is limited by VFS and goes beyond the scope of this
  149  * file.
  150  *
  151  * At least on amd64 the single-threaded bottleneck for long paths is hashing
   152  * (see cache_get_hash). There are cases where the code issues an acquire fence
   153  * multiple times; these can be combined on architectures which suffer from it.
  154  *
   155  * For the locked case, each encountered vnode has to be referenced and locked
   156  * in order to be handed out to the caller (normally that's namei). This
   157  * introduces a significant single-threaded hit and multi-threaded serialization.
  158  *
  159  * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached --
   160  * it avoids any writes to shared areas for any component.
  161  *
  162  * Unrelated insertions are partially serialized on updating the global entry
  163  * counter and possibly serialized on colliding bucket or vnode locks.
  164  *
  165  * IV. Observability
  166  *
   167  * Note that not everything has an explicit dtrace probe, nor should it; thus
  168  * some of the one-liners below depend on implementation details.
  169  *
  170  * Examples:
  171  *
  172  * # Check what lookups failed to be handled in a lockless manner. Column 1 is
  173  * # line number, column 2 is status code (see cache_fpl_status)
  174  * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
  175  *
  176  * # Lengths of names added by binary name
  177  * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
  178  *
  179  * # Same as above but only those which exceed 64 characters
  180  * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
  181  *
  182  * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
  183  * # path is it
  184  * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
  185  *
  186  * V. Limitations and implementation defects
  187  *
  188  * - since it is possible there is no entry for an open file, tools like
  189  *   "procstat" may fail to resolve fd -> vnode -> path to anything
  190  * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
  191  *   shortage) in which case the above problem applies
  192  * - hardlinks are not tracked, thus if a vnode is reachable in more than one
  193  *   way, resolving a name may return a different path than the one used to
  194  *   open it (even if said path is still valid)
  195  * - by default entries are not added for newly created files
   196  * - adding an entry may need to evict a negative entry first, which happens in 2
  197  *   distinct places (evicting on lookup, adding in a later VOP) making it
  198  *   impossible to simply reuse it
  199  * - there is a simple scheme to evict negative entries as the cache is approaching
  200  *   its capacity, but it is very unclear if doing so is a good idea to begin with
  201  * - vnodes are subject to being recycled even if target inode is left in memory,
   202  * which loses the name cache entries when it perhaps should not. In the case
   203  * of tmpfs, names get duplicated -- kept by the filesystem itself and the namecache separately
  204  * - struct namecache has a fixed size and comes in 2 variants, often wasting space.
   205  *   It is now hard to replace with malloc due to the dependence on SMR.
  206  * - lack of better integration with the kernel also turns nullfs into a layered
  207  *   filesystem instead of something which can take advantage of caching
  208  */
  209 
  210 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  211     "Name cache");
  212 
  213 SDT_PROVIDER_DECLARE(vfs);
  214 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
  215     "struct vnode *");
  216 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
  217     "struct vnode *");
  218 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
  219     "char *");
  220 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
  221     "const char *");
  222 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
  223     "struct namecache *", "int", "int");
  224 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
  225 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
  226     "char *", "struct vnode *");
  227 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
  228 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
  229     "struct vnode *", "char *");
  230 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
  231     "struct vnode *");
  232 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
  233     "struct vnode *", "char *");
  234 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
  235     "char *");
  236 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
  237     "struct componentname *");
  238 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
  239     "struct componentname *");
  240 SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t");
  241 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
  242 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
  243 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
  244 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
  245     "struct vnode *");
  246 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
  247     "char *");
  248 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
  249     "char *");
  250 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
  251 
  252 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
  253 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
  254 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
  255 
  256 static char __read_frequently cache_fast_lookup_enabled = true;
  257 
  258 /*
  259  * This structure describes the elements in the cache of recent
  260  * names looked up by namei.
  261  */
  262 struct negstate {
  263         u_char neg_flag;
  264         u_char neg_hit;
  265 };
  266 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
  267     "the state must fit in a union with a pointer without growing it");
  268 
  269 struct  namecache {
  270         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
  271         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
  272         CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
  273         struct  vnode *nc_dvp;          /* vnode of parent of name */
  274         union {
  275                 struct  vnode *nu_vp;   /* vnode the name refers to */
  276                 struct  negstate nu_neg;/* negative entry state */
  277         } n_un;
  278         u_char  nc_flag;                /* flag bits */
  279         u_char  nc_nlen;                /* length of name */
  280         char    nc_name[];              /* segment name + nul */
  281 };
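
      /*
       * For example, with a fully cached "foo/bar" the entry for "bar" has
       * nc_dvp pointing to the vnode of "foo", hangs off that vnode's list
       * of source entries via nc_src, off its own vnode's list of
       * destination entries via nc_dst and sits in the hash chain keyed on
       * (nc_dvp, nc_name) via nc_hash.
       */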
  282 
  283 /*
  284  * struct namecache_ts repeats struct namecache layout up to the
  285  * nc_nlen member.
  286  * struct namecache_ts is used in place of struct namecache when time(s) need
  287  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
  288  * both a non-dotdot directory name plus dotdot for the directory's
  289  * parent.
  290  *
  291  * See below for alignment requirement.
  292  */
  293 struct  namecache_ts {
  294         struct  timespec nc_time;       /* timespec provided by fs */
  295         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
  296         int     nc_ticks;               /* ticks value when entry was added */
  297         int     nc_pad;
  298         struct namecache nc_nc;
  299 };
  300 
  301 TAILQ_HEAD(cache_freebatch, namecache);
  302 
  303 /*
  304  * At least mips n32 performs 64-bit accesses to timespec as found
   305  * in namecache_ts and requires them to be aligned. Since other platforms
   306  * may be in the same spot, suffer a little bit and enforce the
  307  * alignment for everyone. Note this is a nop for 64-bit platforms.
  308  */
  309 #define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
  310 
  311 /*
  312  * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
  313  * 4.4 BSD codebase. Later on struct namecache was tweaked to become
  314  * smaller and the value was bumped to retain the total size, but it
  315  * was never re-evaluated for suitability. A simple test counting
  316  * lengths during package building shows that the value of 45 covers
  317  * about 86% of all added entries, reaching 99% at 65.
  318  *
  319  * Regardless of the above, use of dedicated zones instead of malloc may be
  320  * inducing additional waste. This may be hard to address as said zones are
  321  * tied to VFS SMR. Even if retaining them, the current split should be
  322  * re-evaluated.
  323  */
  324 #ifdef __LP64__
  325 #define CACHE_PATH_CUTOFF       45
  326 #define CACHE_LARGE_PAD         6
  327 #else
  328 #define CACHE_PATH_CUTOFF       41
  329 #define CACHE_LARGE_PAD         2
  330 #endif
  331 
  332 #define CACHE_ZONE_SMALL_SIZE           (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
  333 #define CACHE_ZONE_SMALL_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
  334 #define CACHE_ZONE_LARGE_SIZE           (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
  335 #define CACHE_ZONE_LARGE_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
  336 
  337 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
  338 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
  339 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
  340 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
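
      /*
       * A worked example of the sizing above, assuming the usual LP64 layout:
       * the three linkage members (5 pointers), nc_dvp, the union and the two
       * u_chars put nc_name at offset 58, hence:
       *
       *      CACHE_ZONE_SMALL_SIZE = 58 + 45 + 1      = 104, 104 % 8 == 0
       *      CACHE_ZONE_LARGE_SIZE = 58 + 255 + 1 + 6 = 320, 320 % 8 == 0
       *
       * with CACHE_LARGE_PAD existing purely to satisfy the alignment checked
       * by the asserts above.
       */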
  341 
  342 #define nc_vp           n_un.nu_vp
  343 #define nc_neg          n_un.nu_neg
  344 
  345 /*
  346  * Flags in namecache.nc_flag
  347  */
  348 #define NCF_WHITE       0x01
  349 #define NCF_ISDOTDOT    0x02
  350 #define NCF_TS          0x04
  351 #define NCF_DTS         0x08
  352 #define NCF_DVDROP      0x10
  353 #define NCF_NEGATIVE    0x20
  354 #define NCF_INVALID     0x40
  355 #define NCF_WIP         0x80
  356 
  357 /*
  358  * Flags in negstate.neg_flag
  359  */
  360 #define NEG_HOT         0x01
  361 
  362 static bool     cache_neg_evict_cond(u_long lnumcache);
  363 
  364 /*
  365  * Mark an entry as invalid.
  366  *
  367  * This is called before it starts getting deconstructed.
  368  */
  369 static void
  370 cache_ncp_invalidate(struct namecache *ncp)
  371 {
  372 
  373         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
  374             ("%s: entry %p already invalid", __func__, ncp));
  375         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
  376         atomic_thread_fence_rel();
  377 }
  378 
  379 /*
  380  * Check whether the entry can be safely used.
  381  *
  382  * All places which elide locks are supposed to call this after they are
  383  * done with reading from an entry.
  384  */
  385 #define cache_ncp_canuse(ncp)   ({                                      \
  386         struct namecache *_ncp = (ncp);                                 \
  387         u_char _nc_flag;                                                \
  388                                                                         \
  389         atomic_thread_fence_acq();                                      \
  390         _nc_flag = atomic_load_char(&_ncp->nc_flag);                    \
  391         __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);      \
  392 })
  393 
  394 /*
  395  * Like the above but also checks NCF_WHITE.
  396  */
  397 #define cache_fpl_neg_ncp_canuse(ncp)   ({                              \
  398         struct namecache *_ncp = (ncp);                                 \
  399         u_char _nc_flag;                                                \
  400                                                                         \
  401         atomic_thread_fence_acq();                                      \
  402         _nc_flag = atomic_load_char(&_ncp->nc_flag);                    \
  403         __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0);  \
  404 })
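
      /*
       * The acquire fences above pair with the release fence in
       * cache_ncp_invalidate: lockless consumers read the fields of interest
       * first and validate the entry afterwards. A minimal sketch of the
       * pattern (the surrounding loop and error handling are omitted):
       *
       *      vfs_smr_enter();
       *      ncp = <found in the hash chain>;
       *      vp = atomic_load_ptr(&ncp->nc_vp);
       *      if (!cache_ncp_canuse(ncp)) {
       *              vfs_smr_exit();
       *              <fall back to the locked lookup>
       *      }
       */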
  405 
  406 VFS_SMR_DECLARE;
  407 
  408 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  409     "Name cache parameters");
  410 
  411 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
  412 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
  413     "Total namecache capacity");
  414 
  415 u_int ncsizefactor = 2;
  416 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
  417     "Size factor for namecache");
  418 
  419 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
  420 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
  421     "Ratio of negative namecache entries");
  422 
  423 /*
  424  * Negative entry % of namecache capacity above which automatic eviction is allowed.
  425  *
  426  * Check cache_neg_evict_cond for details.
  427  */
  428 static u_int ncnegminpct = 3;
  429 
  430 static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
  431 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
  432     "Negative entry count above which automatic eviction is allowed");
  433 
  434 /*
  435  * Structures associated with name caching.
  436  */
  437 #define NCHHASH(hash) \
  438         (&nchashtbl[(hash) & nchash])
  439 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
  440 static u_long __read_mostly     nchash;                 /* size of hash table */
  441 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
  442     "Size of namecache hash table");
  443 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
  444 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
  445 
  446 struct nchstats nchstats;               /* cache effectiveness statistics */
  447 
  448 static u_int __exclusive_cache_line neg_cycle;
  449 
  450 #define ncneghash       3
  451 #define numneglists     (ncneghash + 1)
  452 
  453 struct neglist {
  454         struct mtx              nl_evict_lock;
  455         struct mtx              nl_lock __aligned(CACHE_LINE_SIZE);
  456         TAILQ_HEAD(, namecache) nl_list;
  457         TAILQ_HEAD(, namecache) nl_hotlist;
  458         u_long                  nl_hotnum;
  459 } __aligned(CACHE_LINE_SIZE);
  460 
  461 static struct neglist neglists[numneglists];
  462 
  463 static inline struct neglist *
  464 NCP2NEGLIST(struct namecache *ncp)
  465 {
  466 
  467         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
  468 }
  469 
  470 static inline struct negstate *
  471 NCP2NEGSTATE(struct namecache *ncp)
  472 {
  473 
  474         MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
  475         return (&ncp->nc_neg);
  476 }
  477 
  478 #define numbucketlocks (ncbuckethash + 1)
  479 static u_int __read_mostly  ncbuckethash;
  480 static struct mtx_padalign __read_mostly  *bucketlocks;
  481 #define HASH2BUCKETLOCK(hash) \
  482         ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
  483 
  484 #define numvnodelocks (ncvnodehash + 1)
  485 static u_int __read_mostly  ncvnodehash;
  486 static struct mtx __read_mostly *vnodelocks;
  487 static inline struct mtx *
  488 VP2VNODELOCK(struct vnode *vp)
  489 {
  490 
  491         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
  492 }
  493 
  494 static void
  495 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
  496 {
  497         struct namecache_ts *ncp_ts;
  498 
  499         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
  500             (tsp == NULL && ticksp == NULL),
  501             ("No NCF_TS"));
  502 
  503         if (tsp == NULL)
  504                 return;
  505 
  506         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
  507         *tsp = ncp_ts->nc_time;
  508         *ticksp = ncp_ts->nc_ticks;
  509 }
  510 
  511 #ifdef DEBUG_CACHE
  512 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
  513 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
  514     "VFS namecache enabled");
  515 #endif
  516 
  517 /* Export size information to userland */
  518 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
  519     sizeof(struct namecache), "sizeof(struct namecache)");
  520 
  521 /*
  522  * The new name cache statistics
  523  */
  524 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  525     "Name cache statistics");
  526 
  527 #define STATNODE_ULONG(name, varname, descr)                                    \
  528         SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
  529 #define STATNODE_COUNTER(name, varname, descr)                                  \
  530         static COUNTER_U64_DEFINE_EARLY(varname);                               \
  531         SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
  532             descr);
  533 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
  534 STATNODE_ULONG(count, numcache, "Number of cache entries");
  535 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
  536 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
  537 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
  538 STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits");
  539 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
  540 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
  541 STATNODE_COUNTER(posszaps, numposzaps,
  542     "Number of cache hits (positive) we do not want to cache");
  543 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
  544 STATNODE_COUNTER(negzaps, numnegzaps,
  545     "Number of cache hits (negative) we do not want to cache");
  546 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
  547 /* These count for vn_getcwd(), too. */
  548 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
  549 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
  550 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
  551     "Number of fullpath search errors (VOP_VPTOCNP failures)");
  552 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
  553 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
  554 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");
  555 
  556 /*
  557  * Debug or developer statistics.
  558  */
  559 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  560     "Name cache debugging");
  561 #define DEBUGNODE_ULONG(name, varname, descr)                                   \
  562         SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
  563 #define DEBUGNODE_COUNTER(name, varname, descr)                                 \
  564         static COUNTER_U64_DEFINE_EARLY(varname);                               \
  565         SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
  566             descr);
  567 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
  568     "Number of successful removals after relocking");
  569 static long zap_bucket_fail;
  570 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
  571 static long zap_bucket_fail2;
  572 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
  573 static long cache_lock_vnodes_cel_3_failures;
  574 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
  575     "Number of times 3-way vnode locking failed");
  576 
  577 static void cache_zap_locked(struct namecache *ncp);
  578 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
  579     char **retbuf, size_t *buflen, size_t addend);
  580 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
  581     char **retbuf, size_t *buflen);
  582 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
  583     char **retbuf, size_t *len, size_t addend);
  584 
  585 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
  586 
  587 static inline void
  588 cache_assert_vlp_locked(struct mtx *vlp)
  589 {
  590 
  591         if (vlp != NULL)
  592                 mtx_assert(vlp, MA_OWNED);
  593 }
  594 
  595 static inline void
  596 cache_assert_vnode_locked(struct vnode *vp)
  597 {
  598         struct mtx *vlp;
  599 
  600         vlp = VP2VNODELOCK(vp);
  601         cache_assert_vlp_locked(vlp);
  602 }
  603 
  604 /*
  605  * Directory vnodes with entries are held for two reasons:
  606  * 1. make them less of a target for reclamation in vnlru
   607  * 2. suffer a smaller performance penalty in locked lookup as requeueing is avoided
  608  *
  609  * It will be feasible to stop doing it altogether if all filesystems start
  610  * supporting lockless lookup.
  611  */
  612 static void
  613 cache_hold_vnode(struct vnode *vp)
  614 {
  615 
  616         cache_assert_vnode_locked(vp);
  617         VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
  618         vhold(vp);
  619         counter_u64_add(numcachehv, 1);
  620 }
  621 
  622 static void
  623 cache_drop_vnode(struct vnode *vp)
  624 {
  625 
  626         /*
  627          * Called after all locks are dropped, meaning we can't assert
  628          * on the state of v_cache_src.
  629          */
  630         vdrop(vp);
  631         counter_u64_add(numcachehv, -1);
  632 }
  633 
  634 /*
  635  * UMA zones.
  636  */
  637 static uma_zone_t __read_mostly cache_zone_small;
  638 static uma_zone_t __read_mostly cache_zone_small_ts;
  639 static uma_zone_t __read_mostly cache_zone_large;
  640 static uma_zone_t __read_mostly cache_zone_large_ts;
  641 
  642 char *
  643 cache_symlink_alloc(size_t size, int flags)
  644 {
  645 
  646         if (size < CACHE_ZONE_SMALL_SIZE) {
  647                 return (uma_zalloc_smr(cache_zone_small, flags));
  648         }
  649         if (size < CACHE_ZONE_LARGE_SIZE) {
  650                 return (uma_zalloc_smr(cache_zone_large, flags));
  651         }
  652         counter_u64_add(symlinktoobig, 1);
  653         SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
  654         return (NULL);
  655 }
  656 
  657 void
  658 cache_symlink_free(char *string, size_t size)
  659 {
  660 
  661         MPASS(string != NULL);
  662         KASSERT(size < CACHE_ZONE_LARGE_SIZE,
  663             ("%s: size %zu too big", __func__, size));
  664 
  665         if (size < CACHE_ZONE_SMALL_SIZE) {
  666                 uma_zfree_smr(cache_zone_small, string);
  667                 return;
  668         }
  669         if (size < CACHE_ZONE_LARGE_SIZE) {
  670                 uma_zfree_smr(cache_zone_large, string);
  671                 return;
  672         }
  673         __assert_unreachable();
  674 }
  675 
  676 static struct namecache *
  677 cache_alloc_uma(int len, bool ts)
  678 {
  679         struct namecache_ts *ncp_ts;
  680         struct namecache *ncp;
  681 
  682         if (__predict_false(ts)) {
  683                 if (len <= CACHE_PATH_CUTOFF)
  684                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
  685                 else
  686                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
  687                 ncp = &ncp_ts->nc_nc;
  688         } else {
  689                 if (len <= CACHE_PATH_CUTOFF)
  690                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
  691                 else
  692                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
  693         }
  694         return (ncp);
  695 }
  696 
  697 static void
  698 cache_free_uma(struct namecache *ncp)
  699 {
  700         struct namecache_ts *ncp_ts;
  701 
  702         if (__predict_false(ncp->nc_flag & NCF_TS)) {
  703                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
  704                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
  705                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
  706                 else
  707                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
  708         } else {
  709                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
  710                         uma_zfree_smr(cache_zone_small, ncp);
  711                 else
  712                         uma_zfree_smr(cache_zone_large, ncp);
  713         }
  714 }
  715 
  716 static struct namecache *
  717 cache_alloc(int len, bool ts)
  718 {
  719         u_long lnumcache;
  720 
  721         /*
  722          * Avoid blowout in namecache entries.
  723          *
  724          * Bugs:
  725          * 1. filesystems may end up trying to add an already existing entry
  726          * (for example this can happen after a cache miss during concurrent
  727          * lookup), in which case we will call cache_neg_evict despite not
  728          * adding anything.
  729          * 2. the routine may fail to free anything and no provisions are made
  730          * to make it try harder (see the inside for failure modes)
  731          * 3. it only ever looks at negative entries.
  732          */
  733         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
  734         if (cache_neg_evict_cond(lnumcache)) {
  735                 lnumcache = atomic_load_long(&numcache);
  736         }
  737         if (__predict_false(lnumcache >= ncsize)) {
  738                 atomic_subtract_long(&numcache, 1);
  739                 counter_u64_add(numdrops, 1);
  740                 return (NULL);
  741         }
  742         return (cache_alloc_uma(len, ts));
  743 }
  744 
  745 static void
  746 cache_free(struct namecache *ncp)
  747 {
  748 
  749         MPASS(ncp != NULL);
  750         if ((ncp->nc_flag & NCF_DVDROP) != 0) {
  751                 cache_drop_vnode(ncp->nc_dvp);
  752         }
  753         cache_free_uma(ncp);
  754         atomic_subtract_long(&numcache, 1);
  755 }
  756 
  757 static void
  758 cache_free_batch(struct cache_freebatch *batch)
  759 {
  760         struct namecache *ncp, *nnp;
  761         int i;
  762 
  763         i = 0;
  764         if (TAILQ_EMPTY(batch))
  765                 goto out;
  766         TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
  767                 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
  768                         cache_drop_vnode(ncp->nc_dvp);
  769                 }
  770                 cache_free_uma(ncp);
  771                 i++;
  772         }
  773         atomic_subtract_long(&numcache, i);
  774 out:
  775         SDT_PROBE1(vfs, namecache, purge, batch, i);
  776 }
  777 
  778 /*
  779  * Hashing.
  780  *
  781  * The code was made to use FNV in 2001 and this choice needs to be revisited.
  782  *
  783  * Short summary of the difficulty:
  784  * The longest name which can be inserted is NAME_MAX characters in length (or
   785  * 255 at the time of writing this comment), while the majority of names used
   786  * in practice are significantly shorter (mostly below 10). More importantly,
   787  * the majority of lookups performed find names even shorter than that.
  788  *
  789  * This poses a problem where hashes which do better than FNV past word size
  790  * (or so) tend to come with additional overhead when finalizing the result,
  791  * making them noticeably slower for the most commonly used range.
  792  *
  793  * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
  794  *
   795  * When looking it up, the most time consuming part by a large margin (at least
  796  * on amd64) is hashing.  Replacing FNV with something which pessimizes short
  797  * input would make the slowest part stand out even more.
  798  */
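
      /*
       * For reference, the per-byte step of fnv_32_buf (sys/fnv_hash.h) used
       * below is, roughly:
       *
       *      hval *= FNV_32_PRIME;
       *      hval ^= *s++;
       *
       * seeded with FNV1_32_INIT, which is also what lets the incremental
       * variants further down compose with the one-shot cache_get_hash.
       */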
  799 
  800 /*
  801  * TODO: With the value stored we can do better than computing the hash based
  802  * on the address.
  803  */
  804 static void
  805 cache_prehash(struct vnode *vp)
  806 {
  807 
  808         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
  809 }
  810 
  811 static uint32_t
  812 cache_get_hash(char *name, u_char len, struct vnode *dvp)
  813 {
  814 
  815         return (fnv_32_buf(name, len, dvp->v_nchash));
  816 }
  817 
  818 static uint32_t
  819 cache_get_hash_iter_start(struct vnode *dvp)
  820 {
  821 
  822         return (dvp->v_nchash);
  823 }
  824 
  825 static uint32_t
  826 cache_get_hash_iter(char c, uint32_t hash)
  827 {
  828 
  829         return (fnv_32_buf(&c, 1, hash));
  830 }
  831 
  832 static uint32_t
  833 cache_get_hash_iter_finish(uint32_t hash)
  834 {
  835 
  836         return (hash);
  837 }
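
      /*
       * The iterators above compose to the same value as the one-shot variant,
       * e.g. for a name walked character by character:
       *
       *      hash = cache_get_hash_iter_start(dvp);
       *      for (i = 0; i < len; i++)
       *              hash = cache_get_hash_iter(name[i], hash);
       *      hash = cache_get_hash_iter_finish(hash);
       *      MPASS(hash == cache_get_hash(name, len, dvp));
       */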
  838 
  839 static inline struct nchashhead *
  840 NCP2BUCKET(struct namecache *ncp)
  841 {
  842         uint32_t hash;
  843 
  844         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
  845         return (NCHHASH(hash));
  846 }
  847 
  848 static inline struct mtx *
  849 NCP2BUCKETLOCK(struct namecache *ncp)
  850 {
  851         uint32_t hash;
  852 
  853         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
  854         return (HASH2BUCKETLOCK(hash));
  855 }
  856 
  857 #ifdef INVARIANTS
  858 static void
  859 cache_assert_bucket_locked(struct namecache *ncp)
  860 {
  861         struct mtx *blp;
  862 
  863         blp = NCP2BUCKETLOCK(ncp);
  864         mtx_assert(blp, MA_OWNED);
  865 }
  866 
  867 static void
  868 cache_assert_bucket_unlocked(struct namecache *ncp)
  869 {
  870         struct mtx *blp;
  871 
  872         blp = NCP2BUCKETLOCK(ncp);
  873         mtx_assert(blp, MA_NOTOWNED);
  874 }
  875 #else
  876 #define cache_assert_bucket_locked(x) do { } while (0)
  877 #define cache_assert_bucket_unlocked(x) do { } while (0)
  878 #endif
  879 
  880 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
  881 static void
  882 _cache_sort_vnodes(void **p1, void **p2)
  883 {
  884         void *tmp;
  885 
  886         MPASS(*p1 != NULL || *p2 != NULL);
  887 
  888         if (*p1 > *p2) {
  889                 tmp = *p2;
  890                 *p2 = *p1;
  891                 *p1 = tmp;
  892         }
  893 }
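
      /*
       * Example usage establishing the lock order relied upon below: sort the
       * lock pointers by address, then take them lowest first:
       *
       *      cache_sort_vnodes(&vlp1, &vlp2);
       *      cache_lock_vnodes(vlp1, vlp2);
       *
       * which satisfies the vlp1 <= vlp2 assertion in cache_lock_vnodes.
       */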
  894 
  895 static void
  896 cache_lock_all_buckets(void)
  897 {
  898         u_int i;
  899 
  900         for (i = 0; i < numbucketlocks; i++)
  901                 mtx_lock(&bucketlocks[i]);
  902 }
  903 
  904 static void
  905 cache_unlock_all_buckets(void)
  906 {
  907         u_int i;
  908 
  909         for (i = 0; i < numbucketlocks; i++)
  910                 mtx_unlock(&bucketlocks[i]);
  911 }
  912 
  913 static void
  914 cache_lock_all_vnodes(void)
  915 {
  916         u_int i;
  917 
  918         for (i = 0; i < numvnodelocks; i++)
  919                 mtx_lock(&vnodelocks[i]);
  920 }
  921 
  922 static void
  923 cache_unlock_all_vnodes(void)
  924 {
  925         u_int i;
  926 
  927         for (i = 0; i < numvnodelocks; i++)
  928                 mtx_unlock(&vnodelocks[i]);
  929 }
  930 
  931 static int
  932 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
  933 {
  934 
  935         cache_sort_vnodes(&vlp1, &vlp2);
  936 
  937         if (vlp1 != NULL) {
  938                 if (!mtx_trylock(vlp1))
  939                         return (EAGAIN);
  940         }
  941         if (!mtx_trylock(vlp2)) {
  942                 if (vlp1 != NULL)
  943                         mtx_unlock(vlp1);
  944                 return (EAGAIN);
  945         }
  946 
  947         return (0);
  948 }
  949 
  950 static void
  951 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
  952 {
  953 
  954         MPASS(vlp1 != NULL || vlp2 != NULL);
  955         MPASS(vlp1 <= vlp2);
  956 
  957         if (vlp1 != NULL)
  958                 mtx_lock(vlp1);
  959         if (vlp2 != NULL)
  960                 mtx_lock(vlp2);
  961 }
  962 
  963 static void
  964 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
  965 {
  966 
  967         MPASS(vlp1 != NULL || vlp2 != NULL);
  968 
  969         if (vlp1 != NULL)
  970                 mtx_unlock(vlp1);
  971         if (vlp2 != NULL)
  972                 mtx_unlock(vlp2);
  973 }
  974 
  975 static int
  976 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
  977 {
  978         struct nchstats snap;
  979 
  980         if (req->oldptr == NULL)
  981                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
  982 
  983         snap = nchstats;
  984         snap.ncs_goodhits = counter_u64_fetch(numposhits);
  985         snap.ncs_neghits = counter_u64_fetch(numneghits);
  986         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
  987             counter_u64_fetch(numnegzaps);
  988         snap.ncs_miss = counter_u64_fetch(nummisszap) +
  989             counter_u64_fetch(nummiss);
  990 
  991         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
  992 }
  993 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
  994     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
  995     "VFS cache effectiveness statistics");
  996 
  997 static void
  998 cache_recalc_neg_min(u_int val)
  999 {
 1000 
 1001         neg_min = (ncsize * val) / 100;
 1002 }
 1003 
 1004 static int
 1005 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
 1006 {
 1007         u_int val;
 1008         int error;
 1009 
 1010         val = ncnegminpct;
 1011         error = sysctl_handle_int(oidp, &val, 0, req);
 1012         if (error != 0 || req->newptr == NULL)
 1013                 return (error);
 1014 
 1015         if (val == ncnegminpct)
 1016                 return (0);
 1017         if (val > 99)
 1018                 return (EINVAL);
 1019         ncnegminpct = val;
 1020         cache_recalc_neg_min(val);
 1021         return (0);
 1022 }
 1023 
 1024 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
 1025     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
 1026     "I", "Negative entry % of namecache capacity above which automatic eviction is allowed");
 1027 
 1028 #ifdef DEBUG_CACHE
 1029 /*
 1030  * Grab an atomic snapshot of the name cache hash chain lengths
 1031  */
 1032 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
 1033     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 1034     "hash table stats");
 1035 
 1036 static int
 1037 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 1038 {
 1039         struct nchashhead *ncpp;
 1040         struct namecache *ncp;
 1041         int i, error, n_nchash, *cntbuf;
 1042 
 1043 retry:
 1044         n_nchash = nchash + 1;  /* nchash is max index, not count */
 1045         if (req->oldptr == NULL)
 1046                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 1047         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 1048         cache_lock_all_buckets();
 1049         if (n_nchash != nchash + 1) {
 1050                 cache_unlock_all_buckets();
 1051                 free(cntbuf, M_TEMP);
 1052                 goto retry;
 1053         }
 1054         /* Scan hash tables counting entries */
 1055         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 1056                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 1057                         cntbuf[i]++;
 1058         cache_unlock_all_buckets();
 1059         for (error = 0, i = 0; i < n_nchash; i++)
 1060                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 1061                         break;
 1062         free(cntbuf, M_TEMP);
 1063         return (error);
 1064 }
 1065 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 1066     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 1067     "nchash chain lengths");
 1068 
 1069 static int
 1070 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 1071 {
 1072         int error;
 1073         struct nchashhead *ncpp;
 1074         struct namecache *ncp;
 1075         int n_nchash;
 1076         int count, maxlength, used, pct;
 1077 
 1078         if (!req->oldptr)
 1079                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 1080 
 1081         cache_lock_all_buckets();
 1082         n_nchash = nchash + 1;  /* nchash is max index, not count */
 1083         used = 0;
 1084         maxlength = 0;
 1085 
 1086         /* Scan hash tables for applicable entries */
 1087         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 1088                 count = 0;
 1089                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 1090                         count++;
 1091                 }
 1092                 if (count)
 1093                         used++;
 1094                 if (maxlength < count)
 1095                         maxlength = count;
 1096         }
 1097         n_nchash = nchash + 1;
 1098         cache_unlock_all_buckets();
 1099         pct = (used * 100) / (n_nchash / 100);
 1100         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 1101         if (error)
 1102                 return (error);
 1103         error = SYSCTL_OUT(req, &used, sizeof(used));
 1104         if (error)
 1105                 return (error);
 1106         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 1107         if (error)
 1108                 return (error);
 1109         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 1110         if (error)
 1111                 return (error);
 1112         return (0);
 1113 }
 1114 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 1115     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 1116     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 1117 #endif
 1118 
 1119 /*
 1120  * Negative entries management
 1121  *
 1122  * Various workloads create plenty of negative entries and barely use them
  1123  * afterwards. Moreover, malicious users can keep performing bogus lookups,
  1124  * adding even more entries. For example "make tinderbox" as of writing this
 1125  * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 1126  * negative.
 1127  *
 1128  * As such, a rather aggressive eviction method is needed. The currently
 1129  * employed method is a placeholder.
 1130  *
 1131  * Entries are split over numneglists separate lists, each of which is further
 1132  * split into hot and cold entries. Entries get promoted after getting a hit.
  1133  * Eviction happens on addition of a new entry.
 1134  */
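
      /*
       * The state of this machinery can be eyeballed with the sysctls defined
       * below, e.g.:
       *
       * sysctl vfs.cache.neg
       *
       * which among others reports the total count (count), the hot subset
       * (hot) and eviction activity (evicted, evict_skipped_*).
       */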
 1135 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 1136     "Name cache negative entry statistics");
 1137 
 1138 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
 1139     "Number of negative cache entries");
 1140 
 1141 static COUNTER_U64_DEFINE_EARLY(neg_created);
 1142 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
 1143     "Number of created negative entries");
 1144 
 1145 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
 1146 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
 1147     "Number of evicted negative entries");
 1148 
 1149 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
 1150 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
 1151     &neg_evict_skipped_empty,
 1152     "Number of times evicting failed due to lack of entries");
 1153 
 1154 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
 1155 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
 1156     &neg_evict_skipped_missed,
 1157     "Number of times evicting failed due to target entry disappearing");
 1158 
 1159 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
 1160 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
 1161     &neg_evict_skipped_contended,
 1162     "Number of times evicting failed due to contention");
 1163 
 1164 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
 1165     "Number of cache hits (negative)");
 1166 
 1167 static int
 1168 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
 1169 {
 1170         int i, out;
 1171 
 1172         out = 0;
 1173         for (i = 0; i < numneglists; i++)
 1174                 out += neglists[i].nl_hotnum;
 1175 
 1176         return (SYSCTL_OUT(req, &out, sizeof(out)));
 1177 }
 1178 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
 1179     CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
 1180     "Number of hot negative entries");
 1181 
 1182 static void
 1183 cache_neg_init(struct namecache *ncp)
 1184 {
 1185         struct negstate *ns;
 1186 
 1187         ncp->nc_flag |= NCF_NEGATIVE;
 1188         ns = NCP2NEGSTATE(ncp);
 1189         ns->neg_flag = 0;
 1190         ns->neg_hit = 0;
 1191         counter_u64_add(neg_created, 1);
 1192 }
 1193 
 1194 #define CACHE_NEG_PROMOTION_THRESH 2
 1195 
 1196 static bool
 1197 cache_neg_hit_prep(struct namecache *ncp)
 1198 {
 1199         struct negstate *ns;
 1200         u_char n;
 1201 
 1202         ns = NCP2NEGSTATE(ncp);
 1203         n = atomic_load_char(&ns->neg_hit);
 1204         for (;;) {
 1205                 if (n >= CACHE_NEG_PROMOTION_THRESH)
 1206                         return (false);
 1207                 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
 1208                         break;
 1209         }
 1210         return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
 1211 }
 1212 
 1213 /*
 1214  * Nothing to do here but it is provided for completeness as some
 1215  * cache_neg_hit_prep callers may end up returning without even
 1216  * trying to promote.
 1217  */
 1218 #define cache_neg_hit_abort(ncp)        do { } while (0)
 1219 
 1220 static void
 1221 cache_neg_hit_finish(struct namecache *ncp)
 1222 {
 1223 
 1224         SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
 1225         counter_u64_add(numneghits, 1);
 1226 }
 1227 
 1228 /*
 1229  * Move a negative entry to the hot list.
 1230  */
 1231 static void
 1232 cache_neg_promote_locked(struct namecache *ncp)
 1233 {
 1234         struct neglist *nl;
 1235         struct negstate *ns;
 1236 
 1237         ns = NCP2NEGSTATE(ncp);
 1238         nl = NCP2NEGLIST(ncp);
 1239         mtx_assert(&nl->nl_lock, MA_OWNED);
 1240         if ((ns->neg_flag & NEG_HOT) == 0) {
 1241                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
 1242                 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
 1243                 nl->nl_hotnum++;
 1244                 ns->neg_flag |= NEG_HOT;
 1245         }
 1246 }
 1247 
 1248 /*
 1249  * Move a hot negative entry to the cold list.
 1250  */
 1251 static void
 1252 cache_neg_demote_locked(struct namecache *ncp)
 1253 {
 1254         struct neglist *nl;
 1255         struct negstate *ns;
 1256 
 1257         ns = NCP2NEGSTATE(ncp);
 1258         nl = NCP2NEGLIST(ncp);
 1259         mtx_assert(&nl->nl_lock, MA_OWNED);
 1260         MPASS(ns->neg_flag & NEG_HOT);
 1261         TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
 1262         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
 1263         nl->nl_hotnum--;
 1264         ns->neg_flag &= ~NEG_HOT;
 1265         atomic_store_char(&ns->neg_hit, 0);
 1266 }
 1267 
 1268 /*
 1269  * Move a negative entry to the hot list if it matches the lookup.
 1270  *
 1271  * We have to take locks, but they may be contended and in the worst
 1272  * case we may need to go off CPU. We don't want to spin within the
 1273  * smr section and we can't block with it. Exiting the section means
 1274  * the found entry could have been evicted. We are going to look it
 1275  * up again.
 1276  */
 1277 static bool
 1278 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
 1279     struct namecache *oncp, uint32_t hash)
 1280 {
 1281         struct namecache *ncp;
 1282         struct neglist *nl;
 1283         u_char nc_flag;
 1284 
 1285         nl = NCP2NEGLIST(oncp);
 1286 
 1287         mtx_lock(&nl->nl_lock);
 1288         /*
 1289          * For hash iteration.
 1290          */
 1291         vfs_smr_enter();
 1292 
 1293         /*
 1294          * Avoid all surprises by only succeeding if we got the same entry and
 1295          * bailing completely otherwise.
 1296          * XXX There are no provisions to keep the vnode around, meaning we may
 1297          * end up promoting a negative entry for a *new* vnode and returning
 1298          * ENOENT on its account. This is the error we want to return anyway
 1299          * and promotion is harmless.
 1300          *
 1301          * In particular at this point there can be a new ncp which matches the
 1302          * search but hashes to a different neglist.
 1303          */
 1304         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 1305                 if (ncp == oncp)
 1306                         break;
 1307         }
 1308 
 1309         /*
 1310          * No match to begin with.
 1311          */
 1312         if (__predict_false(ncp == NULL)) {
 1313                 goto out_abort;
 1314         }
 1315 
 1316         /*
 1317          * The newly found entry may be something different...
 1318          */
 1319         if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 1320             !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
 1321                 goto out_abort;
 1322         }
 1323 
 1324         /*
 1325          * ... and not even negative.
 1326          */
 1327         nc_flag = atomic_load_char(&ncp->nc_flag);
 1328         if ((nc_flag & NCF_NEGATIVE) == 0) {
 1329                 goto out_abort;
 1330         }
 1331 
 1332         if (!cache_ncp_canuse(ncp)) {
 1333                 goto out_abort;
 1334         }
 1335 
 1336         cache_neg_promote_locked(ncp);
 1337         cache_neg_hit_finish(ncp);
 1338         vfs_smr_exit();
 1339         mtx_unlock(&nl->nl_lock);
 1340         return (true);
 1341 out_abort:
 1342         vfs_smr_exit();
 1343         mtx_unlock(&nl->nl_lock);
 1344         return (false);
 1345 }
 1346 
 1347 static void
 1348 cache_neg_promote(struct namecache *ncp)
 1349 {
 1350         struct neglist *nl;
 1351 
 1352         nl = NCP2NEGLIST(ncp);
 1353         mtx_lock(&nl->nl_lock);
 1354         cache_neg_promote_locked(ncp);
 1355         mtx_unlock(&nl->nl_lock);
 1356 }
 1357 
 1358 static void
 1359 cache_neg_insert(struct namecache *ncp)
 1360 {
 1361         struct neglist *nl;
 1362 
 1363         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 1364         cache_assert_bucket_locked(ncp);
 1365         nl = NCP2NEGLIST(ncp);
 1366         mtx_lock(&nl->nl_lock);
 1367         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
 1368         mtx_unlock(&nl->nl_lock);
 1369         atomic_add_long(&numneg, 1);
 1370 }
 1371 
 1372 static void
 1373 cache_neg_remove(struct namecache *ncp)
 1374 {
 1375         struct neglist *nl;
 1376         struct negstate *ns;
 1377 
 1378         cache_assert_bucket_locked(ncp);
 1379         nl = NCP2NEGLIST(ncp);
 1380         ns = NCP2NEGSTATE(ncp);
 1381         mtx_lock(&nl->nl_lock);
 1382         if ((ns->neg_flag & NEG_HOT) != 0) {
 1383                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
 1384                 nl->nl_hotnum--;
 1385         } else {
 1386                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
 1387         }
 1388         mtx_unlock(&nl->nl_lock);
 1389         atomic_subtract_long(&numneg, 1);
 1390 }
 1391 
 1392 static struct neglist *
 1393 cache_neg_evict_select_list(void)
 1394 {
 1395         struct neglist *nl;
 1396         u_int c;
 1397 
 1398         c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
 1399         nl = &neglists[c % numneglists];
 1400         if (!mtx_trylock(&nl->nl_evict_lock)) {
 1401                 counter_u64_add(neg_evict_skipped_contended, 1);
 1402                 return (NULL);
 1403         }
 1404         return (nl);
 1405 }
 1406 
 1407 static struct namecache *
 1408 cache_neg_evict_select_entry(struct neglist *nl)
 1409 {
 1410         struct namecache *ncp, *lncp;
 1411         struct negstate *ns, *lns;
 1412         int i;
 1413 
 1414         mtx_assert(&nl->nl_evict_lock, MA_OWNED);
 1415         mtx_assert(&nl->nl_lock, MA_OWNED);
 1416         ncp = TAILQ_FIRST(&nl->nl_list);
 1417         if (ncp == NULL)
 1418                 return (NULL);
 1419         lncp = ncp;
 1420         lns = NCP2NEGSTATE(lncp);
 1421         for (i = 1; i < 4; i++) {
 1422                 ncp = TAILQ_NEXT(ncp, nc_dst);
 1423                 if (ncp == NULL)
 1424                         break;
 1425                 ns = NCP2NEGSTATE(ncp);
 1426                 if (ns->neg_hit < lns->neg_hit) {
 1427                         lncp = ncp;
 1428                         lns = ns;
 1429                 }
 1430         }
 1431         return (lncp);
 1432 }
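
/*
 * Illustrative note (not in the original file): the loop above samples at
 * most 4 entries from the head of the list and picks the one with the
 * fewest hits, a cheap approximation of least-frequently-used eviction
 * that avoids scanning the whole list.
 */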
 1433 
 1434 static bool
 1435 cache_neg_evict(void)
 1436 {
 1437         struct namecache *ncp, *ncp2;
 1438         struct neglist *nl;
 1439         struct vnode *dvp;
 1440         struct mtx *dvlp;
 1441         struct mtx *blp;
 1442         uint32_t hash;
 1443         u_char nlen;
 1444         bool evicted;
 1445 
 1446         nl = cache_neg_evict_select_list();
 1447         if (nl == NULL) {
 1448                 return (false);
 1449         }
 1450 
 1451         mtx_lock(&nl->nl_lock);
 1452         ncp = TAILQ_FIRST(&nl->nl_hotlist);
 1453         if (ncp != NULL) {
 1454                 cache_neg_demote_locked(ncp);
 1455         }
 1456         ncp = cache_neg_evict_select_entry(nl);
 1457         if (ncp == NULL) {
 1458                 counter_u64_add(neg_evict_skipped_empty, 1);
 1459                 mtx_unlock(&nl->nl_lock);
 1460                 mtx_unlock(&nl->nl_evict_lock);
 1461                 return (false);
 1462         }
 1463         nlen = ncp->nc_nlen;
 1464         dvp = ncp->nc_dvp;
 1465         hash = cache_get_hash(ncp->nc_name, nlen, dvp);
 1466         dvlp = VP2VNODELOCK(dvp);
 1467         blp = HASH2BUCKETLOCK(hash);
 1468         mtx_unlock(&nl->nl_lock);
 1469         mtx_unlock(&nl->nl_evict_lock);
 1470         mtx_lock(dvlp);
 1471         mtx_lock(blp);
 1472         /*
 1473          * Note that since all locks were dropped above, the entry may be
 1474          * gone or reallocated to be something else.
 1475          */
 1476         CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
 1477                 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
 1478                     ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
 1479                         break;
 1480         }
 1481         if (ncp2 == NULL) {
 1482                 counter_u64_add(neg_evict_skipped_missed, 1);
 1483                 ncp = NULL;
 1484                 evicted = false;
 1485         } else {
 1486                 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
 1487                 MPASS(blp == NCP2BUCKETLOCK(ncp));
 1488                 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
 1489                     ncp->nc_name);
 1490                 cache_zap_locked(ncp);
 1491                 counter_u64_add(neg_evicted, 1);
 1492                 evicted = true;
 1493         }
 1494         mtx_unlock(blp);
 1495         mtx_unlock(dvlp);
 1496         if (ncp != NULL)
 1497                 cache_free(ncp);
 1498         return (evicted);
 1499 }
 1500 
 1501 /*
 1502  * Maybe evict a negative entry to create more room.
 1503  *
 1504  * The ncnegfactor parameter limits what fraction of the total entry
 1505  * count negative entries may comprise. However, if the cache is just
 1506  * warming up, this limit leads to excessive evictions.  As such,
 1507  * ncnegminpct (recomputed to neg_min) dictates whether the limit
 1508  * should be applied at all.
 1509  *
 1510  * Try evicting if the cache is close to full capacity regardless of
 1511  * other considerations.
 1512  */
 1513 static bool
 1514 cache_neg_evict_cond(u_long lnumcache)
 1515 {
 1516         u_long lnumneg;
 1517 
 1518         if (ncsize - 1000 < lnumcache)
 1519                 goto out_evict;
 1520         lnumneg = atomic_load_long(&numneg);
 1521         if (lnumneg < neg_min)
 1522                 return (false);
 1523         if (lnumneg * ncnegfactor < lnumcache)
 1524                 return (false);
 1525 out_evict:
 1526         return (cache_neg_evict());
 1527 }
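
/*
 * Illustrative sketch (not in the original file): the condition above,
 * restated as standalone C.  All constants are hypothetical and chosen
 * only for illustration; assuming negfactor = 16 and ncsize = 400000,
 * eviction is attempted once negative entries make up at least 1/16th
 * of all entries, and always when within 1000 entries of capacity.
 */
#if 0
static bool
neg_evict_cond_sketch(unsigned long numcache, unsigned long numneg)
{
        const unsigned long ncsize = 400000;    /* hypothetical */
        const unsigned long neg_min = 4000;     /* hypothetical */
        const unsigned long negfactor = 16;     /* hypothetical */

        if (ncsize - 1000 < numcache)   /* near capacity: always try */
                return (true);
        if (numneg < neg_min)           /* cache still warming up */
                return (false);
        /* Evict once numneg * negfactor >= numcache. */
        return (numneg * negfactor >= numcache);
}
#endif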
 1528 
 1529 /*
 1530  * cache_zap_locked():
 1531  *
 1532  *   Removes a namecache entry from the cache, whether it contains an
 1533  *   actual pointer to a vnode or is just a negative cache entry.
 1534  */
 1535 static void
 1536 cache_zap_locked(struct namecache *ncp)
 1537 {
 1538         struct nchashhead *ncpp;
 1539         struct vnode *dvp, *vp;
 1540 
 1541         dvp = ncp->nc_dvp;
 1542         vp = ncp->nc_vp;
 1543 
 1544         if (!(ncp->nc_flag & NCF_NEGATIVE))
 1545                 cache_assert_vnode_locked(vp);
 1546         cache_assert_vnode_locked(dvp);
 1547         cache_assert_bucket_locked(ncp);
 1548 
 1549         cache_ncp_invalidate(ncp);
 1550 
 1551         ncpp = NCP2BUCKET(ncp);
 1552         CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
 1553         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 1554                 SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
 1555                 TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
 1556                 if (ncp == vp->v_cache_dd) {
 1557                         atomic_store_ptr(&vp->v_cache_dd, NULL);
 1558                 }
 1559         } else {
 1560                 SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
 1561                 cache_neg_remove(ncp);
 1562         }
 1563         if (ncp->nc_flag & NCF_ISDOTDOT) {
 1564                 if (ncp == dvp->v_cache_dd) {
 1565                         atomic_store_ptr(&dvp->v_cache_dd, NULL);
 1566                 }
 1567         } else {
 1568                 LIST_REMOVE(ncp, nc_src);
 1569                 if (LIST_EMPTY(&dvp->v_cache_src)) {
 1570                         ncp->nc_flag |= NCF_DVDROP;
 1571                 }
 1572         }
 1573 }
 1574 
 1575 static void
 1576 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
 1577 {
 1578         struct mtx *blp;
 1579 
 1580         MPASS(ncp->nc_dvp == vp);
 1581         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 1582         cache_assert_vnode_locked(vp);
 1583 
 1584         blp = NCP2BUCKETLOCK(ncp);
 1585         mtx_lock(blp);
 1586         cache_zap_locked(ncp);
 1587         mtx_unlock(blp);
 1588 }
 1589 
 1590 static bool
 1591 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
 1592     struct mtx **vlpp)
 1593 {
 1594         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
 1595         struct mtx *blp;
 1596 
 1597         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
 1598         cache_assert_vnode_locked(vp);
 1599 
 1600         if (ncp->nc_flag & NCF_NEGATIVE) {
 1601                 if (*vlpp != NULL) {
 1602                         mtx_unlock(*vlpp);
 1603                         *vlpp = NULL;
 1604                 }
 1605                 cache_zap_negative_locked_vnode_kl(ncp, vp);
 1606                 return (true);
 1607         }
 1608 
 1609         pvlp = VP2VNODELOCK(vp);
 1610         blp = NCP2BUCKETLOCK(ncp);
 1611         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
 1612         vlp2 = VP2VNODELOCK(ncp->nc_vp);
 1613 
 1614         if (*vlpp == vlp1 || *vlpp == vlp2) {
 1615                 to_unlock = *vlpp;
 1616                 *vlpp = NULL;
 1617         } else {
 1618                 if (*vlpp != NULL) {
 1619                         mtx_unlock(*vlpp);
 1620                         *vlpp = NULL;
 1621                 }
 1622                 cache_sort_vnodes(&vlp1, &vlp2);
 1623                 if (vlp1 == pvlp) {
 1624                         mtx_lock(vlp2);
 1625                         to_unlock = vlp2;
 1626                 } else {
 1627                         if (!mtx_trylock(vlp1))
 1628                                 goto out_relock;
 1629                         to_unlock = vlp1;
 1630                 }
 1631         }
 1632         mtx_lock(blp);
 1633         cache_zap_locked(ncp);
 1634         mtx_unlock(blp);
 1635         if (to_unlock != NULL)
 1636                 mtx_unlock(to_unlock);
 1637         return (true);
 1638 
 1639 out_relock:
 1640         mtx_unlock(vlp2);
 1641         mtx_lock(vlp1);
 1642         mtx_lock(vlp2);
 1643         MPASS(*vlpp == NULL);
 1644         *vlpp = vlp1;
 1645         return (false);
 1646 }
 1647 
 1648 /*
 1649  * If trylocking failed, we can get here. We know enough to take all needed
 1650  * locks in the right order and re-lookup the entry.
 1651  */
 1652 static int
 1653 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
 1654     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
 1655     struct mtx *blp)
 1656 {
 1657         struct namecache *rncp;
 1658 
 1659         cache_assert_bucket_unlocked(ncp);
 1660 
 1661         cache_sort_vnodes(&dvlp, &vlp);
 1662         cache_lock_vnodes(dvlp, vlp);
 1663         mtx_lock(blp);
 1664         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
 1665                 if (rncp == ncp && rncp->nc_dvp == dvp &&
 1666                     rncp->nc_nlen == cnp->cn_namelen &&
 1667                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
 1668                         break;
 1669         }
 1670         if (rncp != NULL) {
 1671                 cache_zap_locked(rncp);
 1672                 mtx_unlock(blp);
 1673                 cache_unlock_vnodes(dvlp, vlp);
 1674                 counter_u64_add(zap_bucket_relock_success, 1);
 1675                 return (0);
 1676         }
 1677 
 1678         mtx_unlock(blp);
 1679         cache_unlock_vnodes(dvlp, vlp);
 1680         return (EAGAIN);
 1681 }
 1682 
 1683 static int __noinline
 1684 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
 1685     uint32_t hash, struct mtx *blp)
 1686 {
 1687         struct mtx *dvlp, *vlp;
 1688         struct vnode *dvp;
 1689 
 1690         cache_assert_bucket_locked(ncp);
 1691 
 1692         dvlp = VP2VNODELOCK(ncp->nc_dvp);
 1693         vlp = NULL;
 1694         if (!(ncp->nc_flag & NCF_NEGATIVE))
 1695                 vlp = VP2VNODELOCK(ncp->nc_vp);
 1696         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
 1697                 cache_zap_locked(ncp);
 1698                 mtx_unlock(blp);
 1699                 cache_unlock_vnodes(dvlp, vlp);
 1700                 return (0);
 1701         }
 1702 
 1703         dvp = ncp->nc_dvp;
 1704         mtx_unlock(blp);
 1705         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
 1706 }
 1707 
 1708 static __noinline int
 1709 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
 1710 {
 1711         struct namecache *ncp;
 1712         struct mtx *blp;
 1713         struct mtx *dvlp, *dvlp2;
 1714         uint32_t hash;
 1715         int error;
 1716 
 1717         if (cnp->cn_namelen == 2 &&
 1718             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
 1719                 dvlp = VP2VNODELOCK(dvp);
 1720                 dvlp2 = NULL;
 1721                 mtx_lock(dvlp);
 1722 retry_dotdot:
 1723                 ncp = dvp->v_cache_dd;
 1724                 if (ncp == NULL) {
 1725                         mtx_unlock(dvlp);
 1726                         if (dvlp2 != NULL)
 1727                                 mtx_unlock(dvlp2);
 1728                         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
 1729                         return (0);
 1730                 }
 1731                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
 1732                         if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
 1733                                 goto retry_dotdot;
 1734                         MPASS(dvp->v_cache_dd == NULL);
 1735                         mtx_unlock(dvlp);
 1736                         if (dvlp2 != NULL)
 1737                                 mtx_unlock(dvlp2);
 1738                         cache_free(ncp);
 1739                 } else {
 1740                         atomic_store_ptr(&dvp->v_cache_dd, NULL);
 1741                         mtx_unlock(dvlp);
 1742                         if (dvlp2 != NULL)
 1743                                 mtx_unlock(dvlp2);
 1744                 }
 1745                 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
 1746                 return (1);
 1747         }
 1748 
 1749         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 1750         blp = HASH2BUCKETLOCK(hash);
 1751 retry:
 1752         if (CK_SLIST_EMPTY(NCHHASH(hash)))
 1753                 goto out_no_entry;
 1754 
 1755         mtx_lock(blp);
 1756 
 1757         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 1758                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 1759                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 1760                         break;
 1761         }
 1762 
 1763         if (ncp == NULL) {
 1764                 mtx_unlock(blp);
 1765                 goto out_no_entry;
 1766         }
 1767 
 1768         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
 1769         if (__predict_false(error != 0)) {
 1770                 zap_bucket_fail++;
 1771                 goto retry;
 1772         }
 1773         counter_u64_add(numposzaps, 1);
 1774         SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
 1775         cache_free(ncp);
 1776         return (1);
 1777 out_no_entry:
 1778         counter_u64_add(nummisszap, 1);
 1779         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
 1780         return (0);
 1781 }
 1782 
 1783 static int __noinline
 1784 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
 1785     struct timespec *tsp, int *ticksp)
 1786 {
 1787         int ltype;
 1788 
 1789         *vpp = dvp;
 1790         counter_u64_add(dothits, 1);
 1791         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
 1792         if (tsp != NULL)
 1793                 timespecclear(tsp);
 1794         if (ticksp != NULL)
 1795                 *ticksp = ticks;
 1796         vrefact(*vpp);
 1797         /*
 1798          * When we look up ".", we can still be asked to lock it
 1799          * differently from how dvp is already locked.
 1800          */
 1801         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
 1802         if (ltype != VOP_ISLOCKED(*vpp)) {
 1803                 if (ltype == LK_EXCLUSIVE) {
 1804                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
 1805                         if (VN_IS_DOOMED((*vpp))) {
 1806                                 /* forced unmount */
 1807                                 vrele(*vpp);
 1808                                 *vpp = NULL;
 1809                                 return (ENOENT);
 1810                         }
 1811                 } else
 1812                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
 1813         }
 1814         return (-1);
 1815 }
 1816 
 1817 static int __noinline
 1818 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
 1819     struct timespec *tsp, int *ticksp)
 1820 {
 1821         struct namecache_ts *ncp_ts;
 1822         struct namecache *ncp;
 1823         struct mtx *dvlp;
 1824         enum vgetstate vs;
 1825         int error, ltype;
 1826         bool whiteout;
 1827 
 1828         MPASS((cnp->cn_flags & ISDOTDOT) != 0);
 1829 
 1830         if ((cnp->cn_flags & MAKEENTRY) == 0) {
 1831                 cache_remove_cnp(dvp, cnp);
 1832                 return (0);
 1833         }
 1834 
 1835         counter_u64_add(dotdothits, 1);
 1836 retry:
 1837         dvlp = VP2VNODELOCK(dvp);
 1838         mtx_lock(dvlp);
 1839         ncp = dvp->v_cache_dd;
 1840         if (ncp == NULL) {
 1841                 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
 1842                 mtx_unlock(dvlp);
 1843                 return (0);
 1844         }
 1845         if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
 1846                 if (ncp->nc_flag & NCF_NEGATIVE)
 1847                         *vpp = NULL;
 1848                 else
 1849                         *vpp = ncp->nc_vp;
 1850         } else
 1851                 *vpp = ncp->nc_dvp;
 1852         if (*vpp == NULL)
 1853                 goto negative_success;
 1854         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
 1855         cache_out_ts(ncp, tsp, ticksp);
 1856         if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
 1857             NCF_DTS && tsp != NULL) {
 1858                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 1859                 *tsp = ncp_ts->nc_dotdottime;
 1860         }
 1861 
 1862         MPASS(dvp != *vpp);
 1863         ltype = VOP_ISLOCKED(dvp);
 1864         VOP_UNLOCK(dvp);
 1865         vs = vget_prep(*vpp);
 1866         mtx_unlock(dvlp);
 1867         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
 1868         vn_lock(dvp, ltype | LK_RETRY);
 1869         if (VN_IS_DOOMED(dvp)) {
 1870                 if (error == 0)
 1871                         vput(*vpp);
 1872                 *vpp = NULL;
 1873                 return (ENOENT);
 1874         }
 1875         if (error) {
 1876                 *vpp = NULL;
 1877                 goto retry;
 1878         }
 1879         return (-1);
 1880 negative_success:
 1881         if (__predict_false(cnp->cn_nameiop == CREATE)) {
 1882                 if (cnp->cn_flags & ISLASTCN) {
 1883                         counter_u64_add(numnegzaps, 1);
 1884                         cache_zap_negative_locked_vnode_kl(ncp, dvp);
 1885                         mtx_unlock(dvlp);
 1886                         cache_free(ncp);
 1887                         return (0);
 1888                 }
 1889         }
 1890 
 1891         whiteout = (ncp->nc_flag & NCF_WHITE);
 1892         cache_out_ts(ncp, tsp, ticksp);
 1893         if (cache_neg_hit_prep(ncp))
 1894                 cache_neg_promote(ncp);
 1895         else
 1896                 cache_neg_hit_finish(ncp);
 1897         mtx_unlock(dvlp);
 1898         if (whiteout)
 1899                 cnp->cn_flags |= ISWHITEOUT;
 1900         return (ENOENT);
 1901 }
 1902 
 1903 /**
 1904  * Lookup a name in the name cache
 1905  *
 1906  * # Arguments
 1907  *
 1908  * - dvp:       Parent directory in which to search.
 1909  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
 1910  * - cnp:       Parameters of the name search.  The most interesting bits of
 1911  *              the cn_flags field have the following meanings:
 1912  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
 1913  *                      it up.
 1914  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
 1915  * - tsp:       Return storage for cache timestamp.  On a successful (positive
 1916  *              or negative) lookup, tsp will be filled with any timespec that
 1917  *              was stored when this cache entry was created.  However, it
 1918  *              will be cleared for "." entries.
 1919  * - ticksp:    Return storage for alternate cache timestamp.  On a successful
 1920  *              (positive or negative) lookup, it will contain the ticks value
 1921  *              that was current when the cache entry was created, unless cnp
 1922  *              was ".".
 1923  *
 1924  * Either both tsp and ticksp have to be provided or neither of them.
 1925  *
 1926  * # Returns
 1927  *
 1928  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
 1929  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
 1930  *              to a forced unmount.  vpp will not be modified.  If the entry
 1931  *              is a whiteout, then the ISWHITEOUT flag will be set in
 1932  *              cnp->cn_flags.
 1933  * - 0:         A cache miss.  vpp will not be modified.
 1934  *
 1935  * # Locking
 1936  *
 1937  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 1938  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 1939  * lock is not recursively acquired.
 1940  */
 1941 static int __noinline
 1942 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
 1943     struct timespec *tsp, int *ticksp)
 1944 {
 1945         struct namecache *ncp;
 1946         struct mtx *blp;
 1947         uint32_t hash;
 1948         enum vgetstate vs;
 1949         int error;
 1950         bool whiteout;
 1951 
 1952         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
 1953         MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
 1954 
 1955 retry:
 1956         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 1957         blp = HASH2BUCKETLOCK(hash);
 1958         mtx_lock(blp);
 1959 
 1960         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 1961                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 1962                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 1963                         break;
 1964         }
 1965 
 1966         if (__predict_false(ncp == NULL)) {
 1967                 mtx_unlock(blp);
 1968                 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
 1969                 counter_u64_add(nummiss, 1);
 1970                 return (0);
 1971         }
 1972 
 1973         if (ncp->nc_flag & NCF_NEGATIVE)
 1974                 goto negative_success;
 1975 
 1976         counter_u64_add(numposhits, 1);
 1977         *vpp = ncp->nc_vp;
 1978         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
 1979         cache_out_ts(ncp, tsp, ticksp);
 1980         MPASS(dvp != *vpp);
 1981         vs = vget_prep(*vpp);
 1982         mtx_unlock(blp);
 1983         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
 1984         if (error) {
 1985                 *vpp = NULL;
 1986                 goto retry;
 1987         }
 1988         return (-1);
 1989 negative_success:
 1990         /*
 1991          * Apart from corner cases, we don't get here with a regular lookup.
 1992          */
 1993         if (__predict_true(cnp->cn_nameiop == CREATE)) {
 1994                 if (cnp->cn_flags & ISLASTCN) {
 1995                         counter_u64_add(numnegzaps, 1);
 1996                         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
 1997                         if (__predict_false(error != 0)) {
 1998                                 zap_bucket_fail2++;
 1999                                 goto retry;
 2000                         }
 2001                         cache_free(ncp);
 2002                         return (0);
 2003                 }
 2004         }
 2005 
 2006         whiteout = (ncp->nc_flag & NCF_WHITE);
 2007         cache_out_ts(ncp, tsp, ticksp);
 2008         if (cache_neg_hit_prep(ncp))
 2009                 cache_neg_promote(ncp);
 2010         else
 2011                 cache_neg_hit_finish(ncp);
 2012         mtx_unlock(blp);
 2013         if (whiteout)
 2014                 cnp->cn_flags |= ISWHITEOUT;
 2015         return (ENOENT);
 2016 }
 2017 
 2018 int
 2019 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
 2020     struct timespec *tsp, int *ticksp)
 2021 {
 2022         struct namecache *ncp;
 2023         uint32_t hash;
 2024         enum vgetstate vs;
 2025         int error;
 2026         bool whiteout, neg_promote;
 2027         u_short nc_flag;
 2028 
 2029         MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
 2030 
 2031 #ifdef DEBUG_CACHE
 2032         if (__predict_false(!doingcache)) {
 2033                 cnp->cn_flags &= ~MAKEENTRY;
 2034                 return (0);
 2035         }
 2036 #endif
 2037 
 2038         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
 2039                 if (cnp->cn_namelen == 1)
 2040                         return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
 2041                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
 2042                         return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
 2043         }
 2044 
 2045         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
 2046 
 2047         if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
 2048                 cache_remove_cnp(dvp, cnp);
 2049                 return (0);
 2050         }
 2051 
 2052         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 2053         vfs_smr_enter();
 2054 
 2055         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 2056                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 2057                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 2058                         break;
 2059         }
 2060 
 2061         if (__predict_false(ncp == NULL)) {
 2062                 vfs_smr_exit();
 2063                 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
 2064                 counter_u64_add(nummiss, 1);
 2065                 return (0);
 2066         }
 2067 
 2068         nc_flag = atomic_load_char(&ncp->nc_flag);
 2069         if (nc_flag & NCF_NEGATIVE)
 2070                 goto negative_success;
 2071 
 2072         counter_u64_add(numposhits, 1);
 2073         *vpp = ncp->nc_vp;
 2074         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
 2075         cache_out_ts(ncp, tsp, ticksp);
 2076         MPASS(dvp != *vpp);
 2077         if (!cache_ncp_canuse(ncp)) {
 2078                 vfs_smr_exit();
 2079                 *vpp = NULL;
 2080                 goto out_fallback;
 2081         }
 2082         vs = vget_prep_smr(*vpp);
 2083         vfs_smr_exit();
 2084         if (__predict_false(vs == VGET_NONE)) {
 2085                 *vpp = NULL;
 2086                 goto out_fallback;
 2087         }
 2088         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
 2089         if (error) {
 2090                 *vpp = NULL;
 2091                 goto out_fallback;
 2092         }
 2093         return (-1);
 2094 negative_success:
 2095         if (cnp->cn_nameiop == CREATE) {
 2096                 if (cnp->cn_flags & ISLASTCN) {
 2097                         vfs_smr_exit();
 2098                         goto out_fallback;
 2099                 }
 2100         }
 2101 
 2102         cache_out_ts(ncp, tsp, ticksp);
 2103         whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
 2104         neg_promote = cache_neg_hit_prep(ncp);
 2105         if (!cache_ncp_canuse(ncp)) {
 2106                 cache_neg_hit_abort(ncp);
 2107                 vfs_smr_exit();
 2108                 goto out_fallback;
 2109         }
 2110         if (neg_promote) {
 2111                 vfs_smr_exit();
 2112                 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
 2113                         goto out_fallback;
 2114         } else {
 2115                 cache_neg_hit_finish(ncp);
 2116                 vfs_smr_exit();
 2117         }
 2118         if (whiteout)
 2119                 cnp->cn_flags |= ISWHITEOUT;
 2120         return (ENOENT);
 2121 out_fallback:
 2122         return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
 2123 }
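
/*
 * Illustrative usage sketch (not in the original file): how a caller is
 * expected to interpret the tri-state return value of cache_lookup(),
 * per the contract documented above.  The surrounding lookup logic is
 * hypothetical and heavily simplified.
 */
#if 0
        error = cache_lookup(dvp, &vp, cnp, NULL, NULL);
        switch (error) {
        case -1:        /* positive hit: vp is locked and referenced */
                return (0);
        case ENOENT:    /* negative hit: the name is known not to exist */
                return (ENOENT);
        case 0:         /* miss: fall back to the filesystem lookup */
                return (VOP_CACHEDLOOKUP(dvp, &vp, cnp));
        }
#endif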
 2124 
 2125 struct celockstate {
 2126         struct mtx *vlp[3];
 2127         struct mtx *blp[2];
 2128 };
 2129 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
 2130 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
 2131 
 2132 static inline void
 2133 cache_celockstate_init(struct celockstate *cel)
 2134 {
 2135 
 2136         bzero(cel, sizeof(*cel));
 2137 }
 2138 
 2139 static void
 2140 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
 2141     struct vnode *dvp)
 2142 {
 2143         struct mtx *vlp1, *vlp2;
 2144 
 2145         MPASS(cel->vlp[0] == NULL);
 2146         MPASS(cel->vlp[1] == NULL);
 2147         MPASS(cel->vlp[2] == NULL);
 2148 
 2149         MPASS(vp != NULL || dvp != NULL);
 2150 
 2151         vlp1 = VP2VNODELOCK(vp);
 2152         vlp2 = VP2VNODELOCK(dvp);
 2153         cache_sort_vnodes(&vlp1, &vlp2);
 2154 
 2155         if (vlp1 != NULL) {
 2156                 mtx_lock(vlp1);
 2157                 cel->vlp[0] = vlp1;
 2158         }
 2159         mtx_lock(vlp2);
 2160         cel->vlp[1] = vlp2;
 2161 }
 2162 
 2163 static void
 2164 cache_unlock_vnodes_cel(struct celockstate *cel)
 2165 {
 2166 
 2167         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
 2168 
 2169         if (cel->vlp[0] != NULL)
 2170                 mtx_unlock(cel->vlp[0]);
 2171         if (cel->vlp[1] != NULL)
 2172                 mtx_unlock(cel->vlp[1]);
 2173         if (cel->vlp[2] != NULL)
 2174                 mtx_unlock(cel->vlp[2]);
 2175 }
 2176 
 2177 static bool
 2178 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
 2179 {
 2180         struct mtx *vlp;
 2181         bool ret;
 2182 
 2183         cache_assert_vlp_locked(cel->vlp[0]);
 2184         cache_assert_vlp_locked(cel->vlp[1]);
 2185         MPASS(cel->vlp[2] == NULL);
 2186 
 2187         MPASS(vp != NULL);
 2188         vlp = VP2VNODELOCK(vp);
 2189 
 2190         ret = true;
 2191         if (vlp >= cel->vlp[1]) {
 2192                 mtx_lock(vlp);
 2193         } else {
 2194                 if (mtx_trylock(vlp))
 2195                         goto out;
 2196                 cache_lock_vnodes_cel_3_failures++;
 2197                 cache_unlock_vnodes_cel(cel);
 2198                 if (vlp < cel->vlp[0]) {
 2199                         mtx_lock(vlp);
 2200                         mtx_lock(cel->vlp[0]);
 2201                         mtx_lock(cel->vlp[1]);
 2202                 } else {
 2203                         if (cel->vlp[0] != NULL)
 2204                                 mtx_lock(cel->vlp[0]);
 2205                         mtx_lock(vlp);
 2206                         mtx_lock(cel->vlp[1]);
 2207                 }
 2208                 ret = false;
 2209         }
 2210 out:
 2211         cel->vlp[2] = vlp;
 2212         return (ret);
 2213 }
 2214 
 2215 static void
 2216 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
 2217     struct mtx *blp2)
 2218 {
 2219 
 2220         MPASS(cel->blp[0] == NULL);
 2221         MPASS(cel->blp[1] == NULL);
 2222 
 2223         cache_sort_vnodes(&blp1, &blp2);
 2224 
 2225         if (blp1 != NULL) {
 2226                 mtx_lock(blp1);
 2227                 cel->blp[0] = blp1;
 2228         }
 2229         mtx_lock(blp2);
 2230         cel->blp[1] = blp2;
 2231 }
 2232 
 2233 static void
 2234 cache_unlock_buckets_cel(struct celockstate *cel)
 2235 {
 2236 
 2237         if (cel->blp[0] != NULL)
 2238                 mtx_unlock(cel->blp[0]);
 2239         mtx_unlock(cel->blp[1]);
 2240 }
 2241 
 2242 /*
 2243  * Lock part of the cache affected by the insertion.
 2244  *
 2245  * This means vnodelocks for dvp, vp and the relevant bucketlock.
 2246  * However, insertion can result in removal of an old entry. In this
 2247  * case we have an additional vnode and bucketlock pair to lock.
 2248  *
 2249  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 2250  * preserving the locking order (smaller address first).
 2251  */
 2252 static void
 2253 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
 2254     uint32_t hash)
 2255 {
 2256         struct namecache *ncp;
 2257         struct mtx *blps[2];
 2258         u_char nc_flag;
 2259 
 2260         blps[0] = HASH2BUCKETLOCK(hash);
 2261         for (;;) {
 2262                 blps[1] = NULL;
 2263                 cache_lock_vnodes_cel(cel, dvp, vp);
 2264                 if (vp == NULL || vp->v_type != VDIR)
 2265                         break;
 2266                 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
 2267                 if (ncp == NULL)
 2268                         break;
 2269                 nc_flag = atomic_load_char(&ncp->nc_flag);
 2270                 if ((nc_flag & NCF_ISDOTDOT) == 0)
 2271                         break;
 2272                 MPASS(ncp->nc_dvp == vp);
 2273                 blps[1] = NCP2BUCKETLOCK(ncp);
 2274                 if ((nc_flag & NCF_NEGATIVE) != 0)
 2275                         break;
 2276                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
 2277                         break;
 2278                 /*
 2279                  * All vnodes got re-locked. Re-validate the state and if
 2280                  * nothing changed we are done. Otherwise restart.
 2281                  */
 2282                 if (ncp == vp->v_cache_dd &&
 2283                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
 2284                     blps[1] == NCP2BUCKETLOCK(ncp) &&
 2285                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
 2286                         break;
 2287                 cache_unlock_vnodes_cel(cel);
 2288                 cel->vlp[0] = NULL;
 2289                 cel->vlp[1] = NULL;
 2290                 cel->vlp[2] = NULL;
 2291         }
 2292         cache_lock_buckets_cel(cel, blps[0], blps[1]);
 2293 }
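
/*
 * Illustrative sketch (not in the original file) of the address-based
 * lock ordering relied on above: cache_sort_vnodes() is assumed to order
 * two lock pointers so that the lower address is always taken first,
 * which keeps multi-lock acquisition deadlock-free.
 */
#if 0
static void
sort_locks_sketch(struct mtx **lk1, struct mtx **lk2)
{
        struct mtx *tmp;

        if (*lk1 > *lk2) {      /* smaller address must be locked first */
                tmp = *lk1;
                *lk1 = *lk2;
                *lk2 = tmp;
        }
}
#endif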
 2294 
 2295 static void
 2296 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
 2297     uint32_t hash)
 2298 {
 2299         struct namecache *ncp;
 2300         struct mtx *blps[2];
 2301         u_char nc_flag;
 2302 
 2303         blps[0] = HASH2BUCKETLOCK(hash);
 2304         for (;;) {
 2305                 blps[1] = NULL;
 2306                 cache_lock_vnodes_cel(cel, dvp, vp);
 2307                 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
 2308                 if (ncp == NULL)
 2309                         break;
 2310                 nc_flag = atomic_load_char(&ncp->nc_flag);
 2311                 if ((nc_flag & NCF_ISDOTDOT) == 0)
 2312                         break;
 2313                 MPASS(ncp->nc_dvp == dvp);
 2314                 blps[1] = NCP2BUCKETLOCK(ncp);
 2315                 if ((nc_flag & NCF_NEGATIVE) != 0)
 2316                         break;
 2317                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
 2318                         break;
 2319                 if (ncp == dvp->v_cache_dd &&
 2320                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
 2321                     blps[1] == NCP2BUCKETLOCK(ncp) &&
 2322                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
 2323                         break;
 2324                 cache_unlock_vnodes_cel(cel);
 2325                 cel->vlp[0] = NULL;
 2326                 cel->vlp[1] = NULL;
 2327                 cel->vlp[2] = NULL;
 2328         }
 2329         cache_lock_buckets_cel(cel, blps[0], blps[1]);
 2330 }
 2331 
 2332 static void
 2333 cache_enter_unlock(struct celockstate *cel)
 2334 {
 2335 
 2336         cache_unlock_buckets_cel(cel);
 2337         cache_unlock_vnodes_cel(cel);
 2338 }
 2339 
 2340 static void __noinline
 2341 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
 2342     struct componentname *cnp)
 2343 {
 2344         struct celockstate cel;
 2345         struct namecache *ncp;
 2346         uint32_t hash;
 2347         int len;
 2348 
 2349         if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
 2350                 return;
 2351         len = cnp->cn_namelen;
 2352         cache_celockstate_init(&cel);
 2353         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 2354         cache_enter_lock_dd(&cel, dvp, vp, hash);
 2355         ncp = dvp->v_cache_dd;
 2356         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
 2357                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
 2358                 cache_zap_locked(ncp);
 2359         } else {
 2360                 ncp = NULL;
 2361         }
 2362         atomic_store_ptr(&dvp->v_cache_dd, NULL);
 2363         cache_enter_unlock(&cel);
 2364         if (ncp != NULL)
 2365                 cache_free(ncp);
 2366 }
 2367 
 2368 /*
 2369  * Add an entry to the cache.
 2370  */
 2371 void
 2372 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
 2373     struct timespec *tsp, struct timespec *dtsp)
 2374 {
 2375         struct celockstate cel;
 2376         struct namecache *ncp, *n2, *ndd;
 2377         struct namecache_ts *ncp_ts;
 2378         struct nchashhead *ncpp;
 2379         uint32_t hash;
 2380         int flag;
 2381         int len;
 2382 
 2383         KASSERT(cnp->cn_namelen <= NAME_MAX,
 2384             ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
 2385             NAME_MAX));
 2386         VNPASS(!VN_IS_DOOMED(dvp), dvp);
 2387         VNPASS(dvp->v_type != VNON, dvp);
 2388         if (vp != NULL) {
 2389                 VNPASS(!VN_IS_DOOMED(vp), vp);
 2390                 VNPASS(vp->v_type != VNON, vp);
 2391         }
 2392         if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
 2393                 KASSERT(dvp == vp,
 2394                     ("%s: different vnodes for dot entry (%p; %p)\n", __func__,
 2395                     dvp, vp));
 2396         } else {
 2397                 KASSERT(dvp != vp,
 2398                     ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__,
 2399                     cnp->cn_nameptr, dvp));
 2400         }
 2401 
 2402 #ifdef DEBUG_CACHE
 2403         if (__predict_false(!doingcache))
 2404                 return;
 2405 #endif
 2406 
 2407         flag = 0;
 2408         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
 2409                 if (cnp->cn_namelen == 1)
 2410                         return;
 2411                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 2412                         cache_enter_dotdot_prep(dvp, vp, cnp);
 2413                         flag = NCF_ISDOTDOT;
 2414                 }
 2415         }
 2416 
 2417         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
 2418         if (ncp == NULL)
 2419                 return;
 2420 
 2421         cache_celockstate_init(&cel);
 2422         ndd = NULL;
 2423         ncp_ts = NULL;
 2424 
 2425         /*
 2426          * Calculate the hash key and setup as much of the new
 2427          * namecache entry as possible before acquiring the lock.
 2428          */
 2429         ncp->nc_flag = flag | NCF_WIP;
 2430         ncp->nc_vp = vp;
 2431         if (vp == NULL)
 2432                 cache_neg_init(ncp);
 2433         ncp->nc_dvp = dvp;
 2434         if (tsp != NULL) {
 2435                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 2436                 ncp_ts->nc_time = *tsp;
 2437                 ncp_ts->nc_ticks = ticks;
 2438                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
 2439                 if (dtsp != NULL) {
 2440                         ncp_ts->nc_dotdottime = *dtsp;
 2441                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
 2442                 }
 2443         }
 2444         len = ncp->nc_nlen = cnp->cn_namelen;
 2445         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 2446         memcpy(ncp->nc_name, cnp->cn_nameptr, len);
 2447         ncp->nc_name[len] = '\0';
 2448         cache_enter_lock(&cel, dvp, vp, hash);
 2449 
 2450         /*
 2451          * See if this vnode or negative entry is already in the cache
 2452          * with this name.  This can happen with concurrent lookups of
 2453          * the same path name.
 2454          */
 2455         ncpp = NCHHASH(hash);
 2456         CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
 2457                 if (n2->nc_dvp == dvp &&
 2458                     n2->nc_nlen == cnp->cn_namelen &&
 2459                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
 2460                         MPASS(cache_ncp_canuse(n2));
 2461                         if ((n2->nc_flag & NCF_NEGATIVE) != 0)
 2462                                 KASSERT(vp == NULL,
 2463                                     ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
 2464                                     __func__, NULL, vp, cnp->cn_nameptr));
 2465                         else
 2466                                 KASSERT(n2->nc_vp == vp,
 2467                                     ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
 2468                                     __func__, n2->nc_vp, vp, cnp->cn_nameptr));
 2469                         /*
 2470                          * Entries are supposed to be immutable unless in the
 2471                          * process of getting destroyed. Accommodating changing
 2472                          * timestamps is possible but not worth it.
 2473                          * This should be harmless in terms of correctness, in
 2474                          * the worst case resulting in an earlier expiration.
 2475                          * Alternatively, the found entry can be replaced
 2476                          * altogether.
 2477                          */
 2478                         MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
 2479 #if 0
 2480                         if (tsp != NULL) {
 2481                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
 2482                                     ("no NCF_TS"));
 2483                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
 2484                                 n2_ts->nc_time = ncp_ts->nc_time;
 2485                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
 2486                                 if (dtsp != NULL) {
 2487                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
 2488                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
 2489                                 }
 2490                         }
 2491 #endif
 2492                         SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
 2493                             vp);
 2494                         goto out_unlock_free;
 2495                 }
 2496         }
 2497 
 2498         if (flag == NCF_ISDOTDOT) {
 2499                 /*
 2500                  * See if we are trying to add a ".." entry, but some other
 2501                  * lookup has already populated the v_cache_dd pointer.
 2502                  */
 2503                 if (dvp->v_cache_dd != NULL)
 2504                         goto out_unlock_free;
 2505                 KASSERT(vp == NULL || vp->v_type == VDIR,
 2506                     ("wrong vnode type %p", vp));
 2507                 atomic_thread_fence_rel();
 2508                 atomic_store_ptr(&dvp->v_cache_dd, ncp);
 2509         }
 2510 
 2511         if (vp != NULL) {
 2512                 if (flag != NCF_ISDOTDOT) {
 2513                         /*
 2514                          * For this case, the cache entry maps both the
 2515                          * directory name in it and the name ".." for the
 2516                          * directory's parent.
 2517                          */
 2518                         if ((ndd = vp->v_cache_dd) != NULL) {
 2519                                 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
 2520                                         cache_zap_locked(ndd);
 2521                                 else
 2522                                         ndd = NULL;
 2523                         }
 2524                         atomic_thread_fence_rel();
 2525                         atomic_store_ptr(&vp->v_cache_dd, ncp);
 2526                 } else if (vp->v_type != VDIR) {
 2527                         if (vp->v_cache_dd != NULL) {
 2528                                 atomic_store_ptr(&vp->v_cache_dd, NULL);
 2529                         }
 2530                 }
 2531         }
 2532 
 2533         if (flag != NCF_ISDOTDOT) {
 2534                 if (LIST_EMPTY(&dvp->v_cache_src)) {
 2535                         cache_hold_vnode(dvp);
 2536                 }
 2537                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 2538         }
 2539 
 2540         /*
 2541          * If the entry is "negative", we place it into the
 2542          * "negative" cache queue, otherwise, we place it into the
 2543          * destination vnode's cache entries queue.
 2544          */
 2545         if (vp != NULL) {
 2546                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
 2547                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
 2548                     vp);
 2549         } else {
 2550                 if (cnp->cn_flags & ISWHITEOUT)
 2551                         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
 2552                 cache_neg_insert(ncp);
 2553                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
 2554                     ncp->nc_name);
 2555         }
 2556 
 2557         /*
 2558          * Insert the new namecache entry into the appropriate chain
 2559          * within the cache entries table.
 2560          */
 2561         CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 2562 
 2563         atomic_thread_fence_rel();
 2564         /*
 2565          * Mark the entry as fully constructed.
 2566          * It is immutable past this point until its removal.
 2567          */
 2568         atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
 2569 
 2570         cache_enter_unlock(&cel);
 2571         if (ndd != NULL)
 2572                 cache_free(ndd);
 2573         return;
 2574 out_unlock_free:
 2575         cache_enter_unlock(&cel);
 2576         cache_free(ncp);
 2577         return;
 2578 }
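
/*
 * Illustrative usage sketch (not in the original file): a filesystem's
 * lookup routine would typically add an entry once it has resolved a
 * name, provided the caller asked for caching via MAKEENTRY.  The call
 * sites and variable names here are hypothetical.
 */
#if 0
        /* On success: */
        if ((cnp->cn_flags & MAKEENTRY) != 0)
                cache_enter(dvp, *vpp, cnp);    /* positive entry */
        /* On ENOENT, when the fs wants negative caching: */
        if ((cnp->cn_flags & MAKEENTRY) != 0 && error == ENOENT)
                cache_enter(dvp, NULL, cnp);    /* negative entry */
#endif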
 2579 
 2580 /*
 2581  * A variant of the above accepting flags.
 2582  *
 2583  * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it.
 2584  *
 2585  * TODO: this routine is a hack. It blindly removes the old entry, even if it
 2586  * happens to match, and it does so in an inefficient manner. It was added
 2587  * to accommodate NFS which runs into a case where the target for a given name
 2588  * may change from under it. Note this does nothing to solve the following
 2589  * race: 2 callers of cache_enter_time_flags pass a different target vnode for
 2590  * the same [dvp, cnp]. It may be argued that code doing this is broken.
 2591  */
 2592 void
 2593 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
 2594     struct timespec *tsp, struct timespec *dtsp, int flags)
 2595 {
 2596 
 2597         MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0);
 2598 
 2599         if (flags & VFS_CACHE_DROPOLD)
 2600                 cache_remove_cnp(dvp, cnp);
 2601         cache_enter_time(dvp, vp, cnp, tsp, dtsp);
 2602 }
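
/*
 * Illustrative sketch (not in the original file): NFS-like code, which
 * may see the target of a name change from under it, would pass the flag
 * as follows (hypothetical call site):
 */
#if 0
        cache_enter_time_flags(dvp, vp, cnp, &ts, NULL, VFS_CACHE_DROPOLD);
#endif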
 2603 
 2604 static u_int
 2605 cache_roundup_2(u_int val)
 2606 {
 2607         u_int res;
 2608 
 2609         for (res = 1; res <= val; res <<= 1)
 2610                 continue;
 2611 
 2612         return (res);
 2613 }
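
/*
 * Illustrative note (not in the original file): the loop returns the
 * smallest power of two strictly greater than val, e.g.:
 *
 *      cache_roundup_2(1000) == 1024
 *      cache_roundup_2(1024) == 2048
 *
 * nchinittbl() below then halves the result, so a request for 1000
 * elements yields a 512-bucket table with hashmask 511.
 */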
 2614 
 2615 static struct nchashhead *
 2616 nchinittbl(u_long elements, u_long *hashmask)
 2617 {
 2618         struct nchashhead *hashtbl;
 2619         u_long hashsize, i;
 2620 
 2621         hashsize = cache_roundup_2(elements) / 2;
 2622 
 2623         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
 2624         for (i = 0; i < hashsize; i++)
 2625                 CK_SLIST_INIT(&hashtbl[i]);
 2626         *hashmask = hashsize - 1;
 2627         return (hashtbl);
 2628 }
 2629 
 2630 static void
 2631 ncfreetbl(struct nchashhead *hashtbl)
 2632 {
 2633 
 2634         free(hashtbl, M_VFSCACHE);
 2635 }
 2636 
 2637 /*
 2638  * Name cache initialization, from vfs_init() when we are booting
 2639  */
 2640 static void
 2641 nchinit(void *dummy __unused)
 2642 {
 2643         u_int i;
 2644 
 2645         cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
 2646             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 2647         cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
 2648             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 2649         cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
 2650             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 2651         cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
 2652             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 2653 
 2654         VFS_SMR_ZONE_SET(cache_zone_small);
 2655         VFS_SMR_ZONE_SET(cache_zone_small_ts);
 2656         VFS_SMR_ZONE_SET(cache_zone_large);
 2657         VFS_SMR_ZONE_SET(cache_zone_large_ts);
 2658 
 2659         ncsize = desiredvnodes * ncsizefactor;
 2660         cache_recalc_neg_min(ncnegminpct);
 2661         nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
 2662         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
 2663         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
 2664                 ncbuckethash = 7;
 2665         if (ncbuckethash > nchash)
 2666                 ncbuckethash = nchash;
 2667         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
 2668             M_WAITOK | M_ZERO);
 2669         for (i = 0; i < numbucketlocks; i++)
 2670                 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
 2671         ncvnodehash = ncbuckethash;
 2672         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
 2673             M_WAITOK | M_ZERO);
 2674         for (i = 0; i < numvnodelocks; i++)
 2675                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
 2676 
 2677         for (i = 0; i < numneglists; i++) {
 2678                 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
 2679                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
 2680                 TAILQ_INIT(&neglists[i].nl_list);
 2681                 TAILQ_INIT(&neglists[i].nl_hotlist);
 2682         }
 2683 }
 2684 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
 2685 
 2686 void
 2687 cache_vnode_init(struct vnode *vp)
 2688 {
 2689 
 2690         LIST_INIT(&vp->v_cache_src);
 2691         TAILQ_INIT(&vp->v_cache_dst);
 2692         vp->v_cache_dd = NULL;
 2693         cache_prehash(vp);
 2694 }
 2695 
 2696 /*
 2697  * Induce transient cache misses for lockless operation in cache_lookup() by
 2698  * using a temporary hash table.
 2699  *
 2700  * This will force a fs lookup.
 2701  *
 2702  * Synchronization is done in 2 steps, calling vfs_smr_synchronize each time
 2703  * to observe all CPUs not performing the lookup.
 2704  */
 2705 static void
 2706 cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash)
 2707 {
 2708 
 2709         MPASS(temphash < nchash);
 2710         /*
 2711          * Change the size. The new size is smaller and can safely be used
 2712          * against the existing table. All lookups which now hash wrong will
 2713          * result in a cache miss, which all callers are supposed to know how
 2714          * to handle.
 2715          */
 2716         atomic_store_long(&nchash, temphash);
 2717         atomic_thread_fence_rel();
 2718         vfs_smr_synchronize();
 2719         /*
 2720          * At this point everyone sees the updated hash value, but they still
 2721          * see the old table.
 2722          */
 2723         atomic_store_ptr(&nchashtbl, temptbl);
 2724         atomic_thread_fence_rel();
 2725         vfs_smr_synchronize();
 2726         /*
 2727          * At this point everyone sees the updated table pointer and size pair.
 2728          */
 2729 }
 2730 
 2731 /*
 2732  * Set the new hash table.
 2733  *
 2734  * Similarly to cache_changesize_set_temp(), this has to synchronize against
 2735  * lockless operation in cache_lookup().
 2736  */
 2737 static void
 2738 cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash)
 2739 {
 2740 
 2741         MPASS(nchash < new_hash);
 2742         /*
 2743          * Change the pointer first. This won't result in out-of-bounds
 2744          * access since the temporary table is guaranteed to be smaller.
 2745          */
 2746         atomic_store_ptr(&nchashtbl, new_tbl);
 2747         atomic_thread_fence_rel();
 2748         vfs_smr_synchronize();
 2749         /*
 2750          * At this point everyone sees the updated pointer value, but they
 2751          * still see the old size.
 2752          */
 2753         atomic_store_long(&nchash, new_hash);
 2754         atomic_thread_fence_rel();
 2755         vfs_smr_synchronize();
 2756         /*
 2757          * At this point everyone sees the updated table pointer and size pair.
 2758          */
 2759 }
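
/*
 * Illustrative sketch (not in the original file) of the two-step
 * publication pattern used by the two helpers above: when a pair of
 * dependent variables must change together under SMR, each store is
 * published and waited on separately, in an order that keeps every
 * intermediate state safe for concurrent lockless readers.  The
 * variables below are hypothetical stand-ins for nchashtbl/nchash.
 */
#if 0
        /* Shrinking: publish the smaller size first, then the table. */
        atomic_store_long(&size, small_size);
        atomic_thread_fence_rel();
        vfs_smr_synchronize();          /* all readers see small_size */
        atomic_store_ptr(&table, small_table);

        /* Growing: publish the bigger table first, then the size. */
        atomic_store_ptr(&table, big_table);
        atomic_thread_fence_rel();
        vfs_smr_synchronize();          /* all readers see big_table */
        atomic_store_long(&size, big_size);
#endif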
 2760 
 2761 void
 2762 cache_changesize(u_long newmaxvnodes)
 2763 {
 2764         struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl;
 2765         u_long new_nchash, old_nchash, temphash;
 2766         struct namecache *ncp;
 2767         uint32_t hash;
 2768         u_long newncsize;
 2769         int i;
 2770 
 2771         newncsize = newmaxvnodes * ncsizefactor;
 2772         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
 2773         if (newmaxvnodes < numbucketlocks)
 2774                 newmaxvnodes = numbucketlocks;
 2775 
 2776         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
 2777         /* If same hash table size, nothing to do */
 2778         if (nchash == new_nchash) {
 2779                 ncfreetbl(new_nchashtbl);
 2780                 return;
 2781         }
 2782 
 2783         temptbl = nchinittbl(1, &temphash);
 2784 
 2785         /*
 2786          * Move everything from the old hash table to the new table.
 2787          * None of the namecache entries in the table can be removed from
 2788          * under us, since doing so requires taking the bucket locks we hold.
 2789          */
 2790         cache_lock_all_vnodes();
 2791         cache_lock_all_buckets();
 2792         old_nchashtbl = nchashtbl;
 2793         old_nchash = nchash;
 2794         cache_changesize_set_temp(temptbl, temphash);
 2795         for (i = 0; i <= old_nchash; i++) {
 2796                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
 2797                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
 2798                             ncp->nc_dvp);
 2799                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
 2800                         CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash);
 2801                 }
 2802         }
 2803         ncsize = newncsize;
 2804         cache_recalc_neg_min(ncnegminpct);
 2805         cache_changesize_set_new(new_nchashtbl, new_nchash);
 2806         cache_unlock_all_buckets();
 2807         cache_unlock_all_vnodes();
 2808         ncfreetbl(old_nchashtbl);
 2809         ncfreetbl(temptbl);
 2810 }
 2811 
 2812 /*
 2813  * Remove all entries from and to a particular vnode.
 2814  */
 2815 static void
 2816 cache_purge_impl(struct vnode *vp)
 2817 {
 2818         struct cache_freebatch batch;
 2819         struct namecache *ncp;
 2820         struct mtx *vlp, *vlp2;
 2821 
 2822         TAILQ_INIT(&batch);
 2823         vlp = VP2VNODELOCK(vp);
 2824         vlp2 = NULL;
 2825         mtx_lock(vlp);
 2826 retry:
 2827         while (!LIST_EMPTY(&vp->v_cache_src)) {
 2828                 ncp = LIST_FIRST(&vp->v_cache_src);
 2829                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 2830                         goto retry;
 2831                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
 2832         }
 2833         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
 2834                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
 2835                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 2836                         goto retry;
 2837                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
 2838         }
 2839         ncp = vp->v_cache_dd;
 2840         if (ncp != NULL) {
 2841                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
 2842                    ("lost dotdot link"));
 2843                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 2844                         goto retry;
 2845                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
 2846         }
 2847         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
 2848         mtx_unlock(vlp);
 2849         if (vlp2 != NULL)
 2850                 mtx_unlock(vlp2);
 2851         cache_free_batch(&batch);
 2852 }
 2853 
 2854 /*
 2855  * Opportunistic check to see if there is anything to do.
 2856  */
 2857 static bool
 2858 cache_has_entries(struct vnode *vp)
 2859 {
 2860 
 2861         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
 2862             atomic_load_ptr(&vp->v_cache_dd) == NULL)
 2863                 return (false);
 2864         return (true);
 2865 }
 2866 
 2867 void
 2868 cache_purge(struct vnode *vp)
 2869 {
 2870 
 2871         SDT_PROBE1(vfs, namecache, purge, done, vp);
 2872         if (!cache_has_entries(vp))
 2873                 return;
 2874         cache_purge_impl(vp);
 2875 }
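
/*
 * A minimal usage sketch, assuming a hypothetical filesystem helper
 * foofs_remove_impl() (not part of this file): once the name is gone from
 * the backing store, drop every cache entry from and to the vnode.
 *
 *	error = foofs_remove_impl(dvp, vp, cnp);
 *	if (error == 0)
 *		cache_purge(vp);
 */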
 2876 
 2877 /*
 2878  * Only to be used by vgone.
 2879  */
 2880 void
 2881 cache_purge_vgone(struct vnode *vp)
 2882 {
 2883         struct mtx *vlp;
 2884 
 2885         VNPASS(VN_IS_DOOMED(vp), vp);
 2886         if (cache_has_entries(vp)) {
 2887                 cache_purge_impl(vp);
 2888                 return;
 2889         }
 2890 
 2891         /*
 2892          * Serialize against a potential thread doing cache_purge.
 2893          */
 2894         vlp = VP2VNODELOCK(vp);
 2895         mtx_wait_unlocked(vlp);
 2896         if (cache_has_entries(vp)) {
 2897                 cache_purge_impl(vp);
 2898                 return;
 2899         }
 2900         return;
 2901 }
 2902 
 2903 /*
 2904  * Remove all negative entries for a particular directory vnode.
 2905  */
 2906 void
 2907 cache_purge_negative(struct vnode *vp)
 2908 {
 2909         struct cache_freebatch batch;
 2910         struct namecache *ncp, *nnp;
 2911         struct mtx *vlp;
 2912 
 2913         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
 2914         if (LIST_EMPTY(&vp->v_cache_src))
 2915                 return;
 2916         TAILQ_INIT(&batch);
 2917         vlp = VP2VNODELOCK(vp);
 2918         mtx_lock(vlp);
 2919         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
 2920                 if (!(ncp->nc_flag & NCF_NEGATIVE))
 2921                         continue;
 2922                 cache_zap_negative_locked_vnode_kl(ncp, vp);
 2923                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
 2924         }
 2925         mtx_unlock(vlp);
 2926         cache_free_batch(&batch);
 2927 }
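
/*
 * A hedged usage sketch (hypothetical caller, foofs_create_impl() is an
 * assumed helper): after creating a name under dvp outside of cache_enter,
 * stale negative entries could shadow it, so a filesystem may flush them:
 *
 *	error = foofs_create_impl(dvp, &vp, cnp);
 *	if (error == 0)
 *		cache_purge_negative(dvp);
 */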
 2928 
 2929 /*
 2930  * Entry points for modifying VOP operations.
 2931  */
 2932 void
 2933 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
 2934     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
 2935 {
 2936 
 2937         ASSERT_VOP_IN_SEQC(fdvp);
 2938         ASSERT_VOP_IN_SEQC(fvp);
 2939         ASSERT_VOP_IN_SEQC(tdvp);
 2940         if (tvp != NULL)
 2941                 ASSERT_VOP_IN_SEQC(tvp);
 2942 
 2943         cache_purge(fvp);
 2944         if (tvp != NULL) {
 2945                 cache_purge(tvp);
 2946                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
 2947                     ("%s: lingering negative entry", __func__));
 2948         } else {
 2949                 cache_remove_cnp(tdvp, tcnp);
 2950         }
 2951 
 2952         /*
 2953          * TODO
 2954          *
 2955          * Historically renaming always purged all relevant entries, but
 2956          * that is quite wasteful. In particular it turns out that in many
 2957          * cases the target file is immediately accessed after rename,
 2958          * inducing a cache miss.
 2959          *
 2960          * Recode this to reduce relocking and reuse the existing entry (if any)
 2961          * instead of just removing it above and allocating a new one here.
 2962          */
 2963         cache_enter(tdvp, fvp, tcnp);
 2964 }
 2965 
 2966 void
 2967 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
 2968 {
 2969 
 2970         ASSERT_VOP_IN_SEQC(dvp);
 2971         ASSERT_VOP_IN_SEQC(vp);
 2972         cache_purge(vp);
 2973 }
 2974 
 2975 #ifdef INVARIANTS
 2976 /*
 2977  * Validate that if an entry exists it matches.
 2978  */
 2979 void
 2980 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 2981 {
 2982         struct namecache *ncp;
 2983         struct mtx *blp;
 2984         uint32_t hash;
 2985 
 2986         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 2987         if (CK_SLIST_EMPTY(NCHHASH(hash)))
 2988                 return;
 2989         blp = HASH2BUCKETLOCK(hash);
 2990         mtx_lock(blp);
 2991         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 2992                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 2993                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
 2994                         if (ncp->nc_vp != vp)
 2995                                 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n",
 2996                                     __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp);
 2997                 }
 2998         }
 2999         mtx_unlock(blp);
 3000 }
 3001 
 3002 void
 3003 cache_assert_no_entries(struct vnode *vp)
 3004 {
 3005 
 3006         VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp);
 3007         VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
 3008         VNPASS(vp->v_cache_dd == NULL, vp);
 3009 }
 3010 #endif
 3011 
 3012 /*
 3013  * Flush all entries referencing a particular filesystem.
 3014  */
 3015 void
 3016 cache_purgevfs(struct mount *mp)
 3017 {
 3018         struct vnode *vp, *mvp;
 3019         size_t visited, purged;
 3020 
 3021         visited = purged = 0;
 3022         /*
 3023          * Somewhat wasteful iteration over all vnodes. Would be better to
 3024          * support filtering and avoid the interlock to begin with.
 3025          */
 3026         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 3027                 visited++;
 3028                 if (!cache_has_entries(vp)) {
 3029                         VI_UNLOCK(vp);
 3030                         continue;
 3031                 }
 3032                 vholdl(vp);
 3033                 VI_UNLOCK(vp);
 3034                 cache_purge(vp);
 3035                 purged++;
 3036                 vdrop(vp);
 3037         }
 3038 
 3039         SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged);
 3040 }
 3041 
 3042 /*
 3043  * Perform canonical checks and a cache lookup; pass the call on to the
 3044  * filesystem through vop_cachedlookup only if needed.
 3045  */
 3046 
 3047 int
 3048 vfs_cache_lookup(struct vop_lookup_args *ap)
 3049 {
 3050         struct vnode *dvp;
 3051         int error;
 3052         struct vnode **vpp = ap->a_vpp;
 3053         struct componentname *cnp = ap->a_cnp;
 3054         int flags = cnp->cn_flags;
 3055 
 3056         *vpp = NULL;
 3057         dvp = ap->a_dvp;
 3058 
 3059         if (dvp->v_type != VDIR)
 3060                 return (ENOTDIR);
 3061 
 3062         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 3063             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 3064                 return (EROFS);
 3065 
 3066         error = vn_dir_check_exec(dvp, cnp);
 3067         if (error != 0)
 3068                 return (error);
 3069 
 3070         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 3071         if (error == 0)
 3072                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 3073         if (error == -1)
 3074                 return (0);
 3075         return (error);
 3076 }
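
/*
 * Filesystems opt into this wrapper by pointing vop_lookup at it and
 * supplying their real lookup routine as vop_cachedlookup. A sketch, with
 * "foofs" standing in for a hypothetical filesystem:
 *
 *	struct vop_vector foofs_vnodeops = {
 *		.vop_default =		&default_vnodeops,
 *		.vop_lookup =		vfs_cache_lookup,
 *		.vop_cachedlookup =	foofs_lookup,
 *	};
 */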
 3077 
 3078 /* Implementation of the getcwd syscall. */
 3079 int
 3080 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
 3081 {
 3082         char *buf, *retbuf;
 3083         size_t buflen;
 3084         int error;
 3085 
 3086         buflen = uap->buflen;
 3087         if (__predict_false(buflen < 2))
 3088                 return (EINVAL);
 3089         if (buflen > MAXPATHLEN)
 3090                 buflen = MAXPATHLEN;
 3091 
 3092         buf = uma_zalloc(namei_zone, M_WAITOK);
 3093         error = vn_getcwd(buf, &retbuf, &buflen);
 3094         if (error == 0)
 3095                 error = copyout(retbuf, uap->buf, buflen);
 3096         uma_zfree(namei_zone, buf);
 3097         return (error);
 3098 }
 3099 
 3100 int
 3101 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
 3102 {
 3103         struct pwd *pwd;
 3104         int error;
 3105 
 3106         vfs_smr_enter();
 3107         pwd = pwd_get_smr();
 3108         error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
 3109             buflen, 0);
 3110         VFS_SMR_ASSERT_NOT_ENTERED();
 3111         if (error < 0) {
 3112                 pwd = pwd_hold(curthread);
 3113                 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
 3114                     retbuf, buflen);
 3115                 pwd_drop(pwd);
 3116         }
 3117 
 3118 #ifdef KTRACE
 3119         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
 3120                 ktrnamei(*retbuf);
 3121 #endif
 3122         return (error);
 3123 }
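
/*
 * The shape of vn_getcwd() recurs throughout the resolving code below: try
 * the lockless SMR variant first; a negative return value means it could not
 * complete safely and the locked variant must redo the work. In sketch form:
 *
 *	vfs_smr_enter();
 *	error = vn_fullpath_any_smr(...);	(exits SMR internally)
 *	if (error < 0)
 *		error = vn_fullpath_any(...);	(locked fallback)
 */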
 3124 
 3125 /*
 3126  * Canonicalize a path by walking it forward and back.
 3127  *
 3128  * BUGS:
 3129  * - Nothing guarantees the integrity of the entire chain. Consider the case
 3130  *   where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of
 3131  *   "foo" into "quux" during the backwards walk. The result will be
 3132  *   "quux/bar/baz/qux", which could not have been obtained by an incremental
 3133  *   walk in userspace. Moreover, the path we return is inaccessible if the
 3134  *   calling thread lacks permission to traverse "quux".
 3135  */
 3136 static int
 3137 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
 3138     size_t size, int flags, enum uio_seg pathseg)
 3139 {
 3140         struct nameidata nd;
 3141         char *retbuf, *freebuf;
 3142         int error;
 3143 
 3144         if (flags != 0)
 3145                 return (EINVAL);
 3146         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1,
 3147             pathseg, path, fd, &cap_fstat_rights);
 3148         if ((error = namei(&nd)) != 0)
 3149                 return (error);
 3150 
 3151         if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR &&
 3152             (nd.ni_vp->v_vflag & VV_ROOT) != 0) {
 3153                 /*
 3154                  * This happens if vp is a file mount. The call to
 3155                  * vn_fullpath_hardlink can panic if path resolution can't be
 3156                  * handled without the directory.
 3157                  *
 3158                  * To resolve this, we find the vnode which was mounted on -
 3159                  * this should have a unique global path since we disallow
 3160                  * mounting on linked files.
 3161                  */
 3162                 struct vnode *covered_vp;
 3163                 error = vn_lock(nd.ni_vp, LK_SHARED);
 3164                 if (error != 0)
 3165                         goto out;
 3166                 covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered;
 3167                 vref(covered_vp);
 3168                 VOP_UNLOCK(nd.ni_vp);
 3169                 error = vn_fullpath(covered_vp, &retbuf, &freebuf);
 3170                 vrele(covered_vp);
 3171         } else {
 3172                 error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr,
 3173                     nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &size);
 3174         }
 3175         if (error == 0) {
 3176                 error = copyout(retbuf, buf, size);
 3177                 free(freebuf, M_TEMP);
 3178         }
 3179 out:
 3180         vrele(nd.ni_vp);
 3181         vrele(nd.ni_dvp);
 3182         NDFREE_PNBUF(&nd);
 3183         return (error);
 3184 }
 3185 
 3186 int
 3187 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
 3188 {
 3189 
 3190         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
 3191             uap->flags, UIO_USERSPACE));
 3192 }
 3193 
 3194 /*
 3195  * Retrieve the full filesystem path that corresponds to a vnode from the
 3196  * name cache (if available).
 3197  */
 3198 int
 3199 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
 3200 {
 3201         struct pwd *pwd;
 3202         char *buf;
 3203         size_t buflen;
 3204         int error;
 3205 
 3206         if (__predict_false(vp == NULL))
 3207                 return (EINVAL);
 3208 
 3209         buflen = MAXPATHLEN;
 3210         buf = malloc(buflen, M_TEMP, M_WAITOK);
 3211         vfs_smr_enter();
 3212         pwd = pwd_get_smr();
 3213         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
 3214         VFS_SMR_ASSERT_NOT_ENTERED();
 3215         if (error < 0) {
 3216                 pwd = pwd_hold(curthread);
 3217                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
 3218                 pwd_drop(pwd);
 3219         }
 3220         if (error == 0)
 3221                 *freebuf = buf;
 3222         else
 3223                 free(buf, M_TEMP);
 3224         return (error);
 3225 }
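
/*
 * A minimal caller sketch: on success *retbuf points into the returned
 * buffer and the caller releases *freebuf, e.g.:
 *
 *	char *fullpath, *freebuf;
 *
 *	if (vn_fullpath(vp, &fullpath, &freebuf) == 0) {
 *		printf("%s\n", fullpath);
 *		free(freebuf, M_TEMP);
 *	}
 */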
 3226 
 3227 /*
 3228  * This function is similar to vn_fullpath, but it attempts to lookup the
 3229  * pathname relative to the global root mount point.  This is required for the
 3230  * auditing sub-system, as audited pathnames must be absolute, relative to the
 3231  * global root mount point.
 3232  */
 3233 int
 3234 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
 3235 {
 3236         char *buf;
 3237         size_t buflen;
 3238         int error;
 3239 
 3240         if (__predict_false(vp == NULL))
 3241                 return (EINVAL);
 3242         buflen = MAXPATHLEN;
 3243         buf = malloc(buflen, M_TEMP, M_WAITOK);
 3244         vfs_smr_enter();
 3245         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
 3246         VFS_SMR_ASSERT_NOT_ENTERED();
 3247         if (error < 0) {
 3248                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
 3249         }
 3250         if (error == 0)
 3251                 *freebuf = buf;
 3252         else
 3253                 free(buf, M_TEMP);
 3254         return (error);
 3255 }
 3256 
 3257 static struct namecache *
 3258 vn_dd_from_dst(struct vnode *vp)
 3259 {
 3260         struct namecache *ncp;
 3261 
 3262         cache_assert_vnode_locked(vp);
 3263         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
 3264                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 3265                         return (ncp);
 3266         }
 3267         return (NULL);
 3268 }
 3269 
 3270 int
 3271 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
 3272 {
 3273         struct vnode *dvp;
 3274         struct namecache *ncp;
 3275         struct mtx *vlp;
 3276         int error;
 3277 
 3278         vlp = VP2VNODELOCK(*vp);
 3279         mtx_lock(vlp);
 3280         ncp = (*vp)->v_cache_dd;
 3281         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
 3282                 KASSERT(ncp == vn_dd_from_dst(*vp),
 3283                     ("%s: mismatch for dd entry (%p != %p)", __func__,
 3284                     ncp, vn_dd_from_dst(*vp)));
 3285         } else {
 3286                 ncp = vn_dd_from_dst(*vp);
 3287         }
 3288         if (ncp != NULL) {
 3289                 if (*buflen < ncp->nc_nlen) {
 3290                         mtx_unlock(vlp);
 3291                         vrele(*vp);
 3292                         counter_u64_add(numfullpathfail4, 1);
 3293                         error = ENOMEM;
 3294                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
 3295                             vp, NULL);
 3296                         return (error);
 3297                 }
 3298                 *buflen -= ncp->nc_nlen;
 3299                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
 3300                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
 3301                     ncp->nc_name, vp);
 3302                 dvp = *vp;
 3303                 *vp = ncp->nc_dvp;
 3304                 vref(*vp);
 3305                 mtx_unlock(vlp);
 3306                 vrele(dvp);
 3307                 return (0);
 3308         }
 3309         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
 3310 
 3311         mtx_unlock(vlp);
 3312         vn_lock(*vp, LK_SHARED | LK_RETRY);
 3313         error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
 3314         vput(*vp);
 3315         if (error) {
 3316                 counter_u64_add(numfullpathfail2, 1);
 3317                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
 3318                 return (error);
 3319         }
 3320 
 3321         *vp = dvp;
 3322         if (VN_IS_DOOMED(dvp)) {
 3323                 /* forced unmount */
 3324                 vrele(dvp);
 3325                 error = ENOENT;
 3326                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
 3327                 return (error);
 3328         }
 3329         /*
 3330          * *vp has its use count incremented still.
 3331          */
 3332 
 3333         return (0);
 3334 }
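
/*
 * Worked illustration of the backwards fill above: with *buflen == 16 and a
 * cached name "etc" (nc_nlen == 3), *buflen drops to 13 and the bytes land
 * at buf[13..15]:
 *
 *	index:	0 ............ 12 13 14 15
 *	buf:	<unused>           e  t  c
 *
 * Repeated calls keep prepending components, so the assembled path is always
 * read starting at buf + *buflen.
 */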
 3335 
 3336 /*
 3337  * Resolve a directory to a pathname.
 3338  *
 3339  * The name of the directory can always be found in the namecache or fetched
 3340  * from the filesystem. There is also guaranteed to be only one parent, meaning
 3341  * we can just follow vnodes up until we find the root.
 3342  *
 3343  * The vnode must be referenced.
 3344  */
 3345 static int
 3346 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
 3347     size_t *len, size_t addend)
 3348 {
 3349 #ifdef KDTRACE_HOOKS
 3350         struct vnode *startvp = vp;
 3351 #endif
 3352         struct vnode *vp1;
 3353         size_t buflen;
 3354         int error;
 3355         bool slash_prefixed;
 3356 
 3357         VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
 3358         VNPASS(vp->v_usecount > 0, vp);
 3359 
 3360         buflen = *len;
 3361 
 3362         slash_prefixed = true;
 3363         if (addend == 0) {
 3364                 MPASS(*len >= 2);
 3365                 buflen--;
 3366                 buf[buflen] = '\0';
 3367                 slash_prefixed = false;
 3368         }
 3369 
 3370         error = 0;
 3371 
 3372         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
 3373         counter_u64_add(numfullpathcalls, 1);
 3374         while (vp != rdir && vp != rootvnode) {
 3375                 /*
 3376                  * The vp vnode must be already fully constructed,
 3377                  * since it is either found in namecache or obtained
 3378                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
 3379                  * without obtaining the vnode lock.
 3380                  */
 3381                 if ((vp->v_vflag & VV_ROOT) != 0) {
 3382                         vn_lock(vp, LK_RETRY | LK_SHARED);
 3383 
 3384                         /*
 3385                          * With the vnode locked, check for races with
 3386                          * unmount, forced or not.  Note that we
 3387                          * already verified that vp is not equal to
 3388                          * the root vnode, which means that
 3389                          * mnt_vnodecovered can be NULL only for the
 3390                          * case of unmount.
 3391                          */
 3392                         if (VN_IS_DOOMED(vp) ||
 3393                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
 3394                             vp1->v_mountedhere != vp->v_mount) {
 3395                                 vput(vp);
 3396                                 error = ENOENT;
 3397                                 SDT_PROBE3(vfs, namecache, fullpath, return,
 3398                                     error, vp, NULL);
 3399                                 break;
 3400                         }
 3401 
 3402                         vref(vp1);
 3403                         vput(vp);
 3404                         vp = vp1;
 3405                         continue;
 3406                 }
 3407                 if (vp->v_type != VDIR) {
 3408                         vrele(vp);
 3409                         counter_u64_add(numfullpathfail1, 1);
 3410                         error = ENOTDIR;
 3411                         SDT_PROBE3(vfs, namecache, fullpath, return,
 3412                             error, vp, NULL);
 3413                         break;
 3414                 }
 3415                 error = vn_vptocnp(&vp, buf, &buflen);
 3416                 if (error)
 3417                         break;
 3418                 if (buflen == 0) {
 3419                         vrele(vp);
 3420                         error = ENOMEM;
 3421                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
 3422                             startvp, NULL);
 3423                         break;
 3424                 }
 3425                 buf[--buflen] = '/';
 3426                 slash_prefixed = true;
 3427         }
 3428         if (error)
 3429                 return (error);
 3430         if (!slash_prefixed) {
 3431                 if (buflen == 0) {
 3432                         vrele(vp);
 3433                         counter_u64_add(numfullpathfail4, 1);
 3434                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
 3435                             startvp, NULL);
 3436                         return (ENOMEM);
 3437                 }
 3438                 buf[--buflen] = '/';
 3439         }
 3440         counter_u64_add(numfullpathfound, 1);
 3441         vrele(vp);
 3442 
 3443         *retbuf = buf + buflen;
 3444         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
 3445         *len -= buflen;
 3446         *len += addend;
 3447         return (0);
 3448 }
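
/*
 * Example of the assembly performed above, resolving "/usr/local" against
 * the root: the terminating nul goes in first, then each iteration prepends
 * a name followed by a '/':
 *
 *	"\0" -> "local\0" -> "/local\0" -> "usr/local\0" -> "/usr/local\0"
 *
 * *retbuf then points at the leading '/' inside the caller's buffer and
 * *len reflects the number of bytes consumed.
 */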
 3449 
 3450 /*
 3451  * Resolve an arbitrary vnode to a pathname.
 3452  *
 3453  * Note two caveats:
 3454  * - hardlinks are not tracked, thus if the vnode is not a directory this can
 3455  *   resolve to a different path than the one used to find it
 3456  * - namecache is not mandatory, meaning names are not guaranteed to be added
 3457  *   (in which case resolving fails)
 3458  */
 3459 static void __inline
 3460 cache_rev_failed_impl(int *reason, int line)
 3461 {
 3462 
 3463         *reason = line;
 3464 }
 3465 #define cache_rev_failed(var)   cache_rev_failed_impl((var), __LINE__)
 3466 
 3467 static int
 3468 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
 3469     char **retbuf, size_t *buflen, size_t addend)
 3470 {
 3471 #ifdef KDTRACE_HOOKS
 3472         struct vnode *startvp = vp;
 3473 #endif
 3474         struct vnode *tvp;
 3475         struct mount *mp;
 3476         struct namecache *ncp;
 3477         size_t orig_buflen;
 3478         int reason;
 3479         int error;
 3480 #ifdef KDTRACE_HOOKS
 3481         int i;
 3482 #endif
 3483         seqc_t vp_seqc, tvp_seqc;
 3484         u_char nc_flag;
 3485 
 3486         VFS_SMR_ASSERT_ENTERED();
 3487 
 3488         if (!atomic_load_char(&cache_fast_lookup_enabled)) {
 3489                 vfs_smr_exit();
 3490                 return (-1);
 3491         }
 3492 
 3493         orig_buflen = *buflen;
 3494 
 3495         if (addend == 0) {
 3496                 MPASS(*buflen >= 2);
 3497                 *buflen -= 1;
 3498                 buf[*buflen] = '\0';
 3499         }
 3500 
 3501         if (vp == rdir || vp == rootvnode) {
 3502                 if (addend == 0) {
 3503                         *buflen -= 1;
 3504                         buf[*buflen] = '/';
 3505                 }
 3506                 goto out_ok;
 3507         }
 3508 
 3509 #ifdef KDTRACE_HOOKS
 3510         i = 0;
 3511 #endif
 3512         error = -1;
 3513         ncp = NULL; /* for sdt probe down below */
 3514         vp_seqc = vn_seqc_read_any(vp);
 3515         if (seqc_in_modify(vp_seqc)) {
 3516                 cache_rev_failed(&reason);
 3517                 goto out_abort;
 3518         }
 3519 
 3520         for (;;) {
 3521 #ifdef KDTRACE_HOOKS
 3522                 i++;
 3523 #endif
 3524                 if ((vp->v_vflag & VV_ROOT) != 0) {
 3525                         mp = atomic_load_ptr(&vp->v_mount);
 3526                         if (mp == NULL) {
 3527                                 cache_rev_failed(&reason);
 3528                                 goto out_abort;
 3529                         }
 3530                         tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
 3531                         tvp_seqc = vn_seqc_read_any(tvp);
 3532                         if (seqc_in_modify(tvp_seqc)) {
 3533                                 cache_rev_failed(&reason);
 3534                                 goto out_abort;
 3535                         }
 3536                         if (!vn_seqc_consistent(vp, vp_seqc)) {
 3537                                 cache_rev_failed(&reason);
 3538                                 goto out_abort;
 3539                         }
 3540                         vp = tvp;
 3541                         vp_seqc = tvp_seqc;
 3542                         continue;
 3543                 }
 3544                 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
 3545                 if (ncp == NULL) {
 3546                         cache_rev_failed(&reason);
 3547                         goto out_abort;
 3548                 }
 3549                 nc_flag = atomic_load_char(&ncp->nc_flag);
 3550                 if ((nc_flag & NCF_ISDOTDOT) != 0) {
 3551                         cache_rev_failed(&reason);
 3552                         goto out_abort;
 3553                 }
 3554                 if (ncp->nc_nlen >= *buflen) {
 3555                         cache_rev_failed(&reason);
 3556                         error = ENOMEM;
 3557                         goto out_abort;
 3558                 }
 3559                 *buflen -= ncp->nc_nlen;
 3560                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
 3561                 *buflen -= 1;
 3562                 buf[*buflen] = '/';
 3563                 tvp = ncp->nc_dvp;
 3564                 tvp_seqc = vn_seqc_read_any(tvp);
 3565                 if (seqc_in_modify(tvp_seqc)) {
 3566                         cache_rev_failed(&reason);
 3567                         goto out_abort;
 3568                 }
 3569                 if (!vn_seqc_consistent(vp, vp_seqc)) {
 3570                         cache_rev_failed(&reason);
 3571                         goto out_abort;
 3572                 }
 3573                 /*
 3574                  * Acquire fence provided by vn_seqc_read_any above.
 3575                  */
 3576                 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
 3577                         cache_rev_failed(&reason);
 3578                         goto out_abort;
 3579                 }
 3580                 if (!cache_ncp_canuse(ncp)) {
 3581                         cache_rev_failed(&reason);
 3582                         goto out_abort;
 3583                 }
 3584                 vp = tvp;
 3585                 vp_seqc = tvp_seqc;
 3586                 if (vp == rdir || vp == rootvnode)
 3587                         break;
 3588         }
 3589 out_ok:
 3590         vfs_smr_exit();
 3591         *retbuf = buf + *buflen;
 3592         *buflen = orig_buflen - *buflen + addend;
 3593         SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
 3594         return (0);
 3595 
 3596 out_abort:
 3597         *buflen = orig_buflen;
 3598         SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
 3599         vfs_smr_exit();
 3600         return (error);
 3601 }
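
/*
 * The lockless walk above follows the usual seqc pattern: snapshot the
 * vnode's sequence counter, read the fields it protects, then confirm the
 * counter is unchanged before trusting what was read. In sketch form:
 *
 *	vp_seqc = vn_seqc_read_any(vp);
 *	<load v_cache_dd, nc_name, nc_dvp>
 *	if (!vn_seqc_consistent(vp, vp_seqc))
 *		<abort; the caller falls back to the locked variant>
 */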
 3602 
 3603 static int
 3604 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
 3605     size_t *buflen)
 3606 {
 3607         size_t orig_buflen, addend;
 3608         int error;
 3609 
 3610         if (*buflen < 2)
 3611                 return (EINVAL);
 3612 
 3613         orig_buflen = *buflen;
 3614 
 3615         vref(vp);
 3616         addend = 0;
 3617         if (vp->v_type != VDIR) {
 3618                 *buflen -= 1;
 3619                 buf[*buflen] = '\0';
 3620                 error = vn_vptocnp(&vp, buf, buflen);
 3621                 if (error)
 3622                         return (error);
 3623                 if (*buflen == 0) {
 3624                         vrele(vp);
 3625                         return (ENOMEM);
 3626                 }
 3627                 *buflen -= 1;
 3628                 buf[*buflen] = '/';
 3629                 addend = orig_buflen - *buflen;
 3630         }
 3631 
 3632         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
 3633 }
 3634 
 3635 /*
 3636  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
 3637  *
 3638  * Since the namecache does not track hardlinks, the caller is expected to
 3639  * first look up the target vnode with WANTPARENT flag passed to namei to get
 3640  * dvp and vp.
 3641  *
 3642  * Then there are two cases:
 3643  * - if the found vnode is a directory, the path can be constructed just by
 3644  *   following names up the chain
 3645  * - otherwise we populate the buffer with the saved name and start resolving
 3646  *   from the parent
 3647  */
 3648 int
 3649 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp,
 3650     const char *hrdl_name, size_t hrdl_name_length,
 3651     char **retbuf, char **freebuf, size_t *buflen)
 3652 {
 3653         char *buf, *tmpbuf;
 3654         struct pwd *pwd;
 3655         size_t addend;
 3656         int error;
 3657         enum vtype type;
 3658 
 3659         if (*buflen < 2)
 3660                 return (EINVAL);
 3661         if (*buflen > MAXPATHLEN)
 3662                 *buflen = MAXPATHLEN;
 3663 
 3664         buf = malloc(*buflen, M_TEMP, M_WAITOK);
 3665 
 3666         addend = 0;
 3667 
 3668         /*
 3669          * Check for VBAD to work around the vp_crossmp bug in lookup().
 3670          *
 3671          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
 3672          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
 3673          * If the type is VDIR (like in this very case) we can skip looking
 3674          * at ni_dvp in the first place. However, since vnodes get passed here
 3675          * unlocked the target may transition to doomed state (type == VBAD)
 3676          * before we get to evaluate the condition. If this happens, we will
 3677          * populate part of the buffer and descend to vn_fullpath_dir with
 3678          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
 3679          *
 3680          * This should be atomic_load(&vp->v_type) but it is illegal to take
 3681          * an address of a bit field, even if said field is sized to char.
 3682          * Work around the problem by reading the value into a full-sized enum
 3683          * and then re-reading it with atomic_load which will still prevent
 3684          * the compiler from re-reading down the road.
 3685          */
 3686         type = vp->v_type;
 3687         type = atomic_load_int(&type);
 3688         if (type == VBAD) {
 3689                 error = ENOENT;
 3690                 goto out_bad;
 3691         }
 3692         if (type != VDIR) {
 3693                 addend = hrdl_name_length + 2;
 3694                 if (*buflen < addend) {
 3695                         error = ENOMEM;
 3696                         goto out_bad;
 3697                 }
 3698                 *buflen -= addend;
 3699                 tmpbuf = buf + *buflen;
 3700                 tmpbuf[0] = '/';
 3701                 memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length);
 3702                 tmpbuf[addend - 1] = '\0';
 3703                 vp = dvp;
 3704         }
 3705 
 3706         vfs_smr_enter();
 3707         pwd = pwd_get_smr();
 3708         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
 3709             addend);
 3710         VFS_SMR_ASSERT_NOT_ENTERED();
 3711         if (error < 0) {
 3712                 pwd = pwd_hold(curthread);
 3713                 vref(vp);
 3714                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
 3715                     addend);
 3716                 pwd_drop(pwd);
 3717         }
 3718         if (error != 0)
 3719                 goto out_bad;
 3720 
 3721         *freebuf = buf;
 3722 
 3723         return (0);
 3724 out_bad:
 3725         free(buf, M_TEMP);
 3726         return (error);
 3727 }
 3728 
 3729 struct vnode *
 3730 vn_dir_dd_ino(struct vnode *vp)
 3731 {
 3732         struct namecache *ncp;
 3733         struct vnode *ddvp;
 3734         struct mtx *vlp;
 3735         enum vgetstate vs;
 3736 
 3737         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
 3738         vlp = VP2VNODELOCK(vp);
 3739         mtx_lock(vlp);
 3740         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
 3741                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
 3742                         continue;
 3743                 ddvp = ncp->nc_dvp;
 3744                 vs = vget_prep(ddvp);
 3745                 mtx_unlock(vlp);
 3746                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
 3747                         return (NULL);
 3748                 return (ddvp);
 3749         }
 3750         mtx_unlock(vlp);
 3751         return (NULL);
 3752 }
 3753 
 3754 int
 3755 vn_commname(struct vnode *vp, char *buf, u_int buflen)
 3756 {
 3757         struct namecache *ncp;
 3758         struct mtx *vlp;
 3759         int l;
 3760 
 3761         vlp = VP2VNODELOCK(vp);
 3762         mtx_lock(vlp);
 3763         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
 3764                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 3765                         break;
 3766         if (ncp == NULL) {
 3767                 mtx_unlock(vlp);
 3768                 return (ENOENT);
 3769         }
 3770         l = min(ncp->nc_nlen, buflen - 1);
 3771         memcpy(buf, ncp->nc_name, l);
 3772         mtx_unlock(vlp);
 3773         buf[l] = '\0';
 3774         return (0);
 3775 }
 3776 
 3777 /*
 3778  * This function updates the path string to the vnode's full global path
 3779  * and checks the size of the new path string against the pathlen argument.
 3780  *
 3781  * Requires a locked, referenced vnode.
 3782  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 3783  *
 3784  * If vp is a directory, the call to vn_fullpath_global() always succeeds
 3785  * because it falls back to the ".." lookup if the namecache lookup fails.
 3786  */
 3787 int
 3788 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
 3789     u_int pathlen)
 3790 {
 3791         struct nameidata nd;
 3792         struct vnode *vp1;
 3793         char *rpath, *fbuf;
 3794         int error;
 3795 
 3796         ASSERT_VOP_ELOCKED(vp, __func__);
 3797 
 3798         /* Construct global filesystem path from vp. */
 3799         VOP_UNLOCK(vp);
 3800         error = vn_fullpath_global(vp, &rpath, &fbuf);
 3801 
 3802         if (error != 0) {
 3803                 vrele(vp);
 3804                 return (error);
 3805         }
 3806 
 3807         if (strlen(rpath) >= pathlen) {
 3808                 vrele(vp);
 3809                 error = ENAMETOOLONG;
 3810                 goto out;
 3811         }
 3812 
 3813         /*
 3814          * Re-lookup the vnode by path to detect a possible rename.
 3815          * As a side effect, the vnode is relocked.
 3816          * If vnode was renamed, return ENOENT.
 3817          */
 3818         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
 3819         error = namei(&nd);
 3820         if (error != 0) {
 3821                 vrele(vp);
 3822                 goto out;
 3823         }
 3824         NDFREE_PNBUF(&nd);
 3825         vp1 = nd.ni_vp;
 3826         vrele(vp);
 3827         if (vp1 == vp)
 3828                 strcpy(path, rpath);
 3829         else {
 3830                 vput(vp1);
 3831                 error = ENOENT;
 3832         }
 3833 
 3834 out:
 3835         free(fbuf, M_TEMP);
 3836         return (error);
 3837 }
 3838 
 3839 /*
 3840  * This is similar to vn_path_to_global_path but allows for regular
 3841  * files which may not be present in the cache.
 3842  *
 3843  * Requires a locked, referenced vnode.
 3844  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 3845  */
 3846 int
 3847 vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp,
 3848     struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name,
 3849     size_t leaf_length)
 3850 {
 3851         struct nameidata nd;
 3852         struct vnode *vp1;
 3853         char *rpath, *fbuf;
 3854         size_t len;
 3855         int error;
 3856 
 3857         ASSERT_VOP_ELOCKED(vp, __func__);
 3858 
 3859         /*
 3860          * Construct global filesystem path from dvp, vp and leaf
 3861          * name.
 3862          */
 3863         VOP_UNLOCK(vp);
 3864         error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length,
 3865             &rpath, &fbuf, &len);
 3866 
 3867         if (error != 0) {
 3868                 vrele(vp);
 3869                 goto out;
 3870         }
 3871 
 3872         if (strlen(rpath) >= pathlen) {
 3873                 vrele(vp);
 3874                 error = ENAMETOOLONG;
 3875                 goto out;
 3876         }
 3877 
 3878         /*
 3879          * Re-lookup the vnode by path to detect a possible rename.
 3880          * As a side effect, the vnode is relocked.
 3881          * If vnode was renamed, return ENOENT.
 3882          */
 3883         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
 3884         error = namei(&nd);
 3885         if (error != 0) {
 3886                 vrele(vp);
 3887                 goto out;
 3888         }
 3889         NDFREE_PNBUF(&nd);
 3890         vp1 = nd.ni_vp;
 3891         vrele(vp);
 3892         if (vp1 == vp)
 3893                 strcpy(path, rpath);
 3894         else {
 3895                 vput(vp1);
 3896                 error = ENOENT;
 3897         }
 3898 
 3899 out:
 3900         free(fbuf, M_TEMP);
 3901         return (error);
 3902 }
 3903 
 3904 #ifdef DDB
 3905 static void
 3906 db_print_vpath(struct vnode *vp)
 3907 {
 3908 
 3909         while (vp != NULL) {
 3910                 db_printf("%p: ", vp);
 3911                 if (vp == rootvnode) {
 3912                         db_printf("/");
 3913                         vp = NULL;
 3914                 } else {
 3915                         if (vp->v_vflag & VV_ROOT) {
 3916                                 db_printf("<mount point>");
 3917                                 vp = vp->v_mount->mnt_vnodecovered;
 3918                         } else {
 3919                                 struct namecache *ncp;
 3920                                 char *ncn;
 3921                                 int i;
 3922 
 3923                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
 3924                                 if (ncp != NULL) {
 3925                                         ncn = ncp->nc_name;
 3926                                         for (i = 0; i < ncp->nc_nlen; i++)
 3927                                                 db_printf("%c", *ncn++);
 3928                                         vp = ncp->nc_dvp;
 3929                                 } else {
 3930                                         vp = NULL;
 3931                                 }
 3932                         }
 3933                 }
 3934                 db_printf("\n");
 3935         }
 3936 
 3937         return;
 3938 }
 3939 
 3940 DB_SHOW_COMMAND(vpath, db_show_vpath)
 3941 {
 3942         struct vnode *vp;
 3943 
 3944         if (!have_addr) {
 3945                 db_printf("usage: show vpath <struct vnode *>\n");
 3946                 return;
 3947         }
 3948 
 3949         vp = (struct vnode *)addr;
 3950         db_print_vpath(vp);
 3951 }
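
/*
 * Example ddb session (the addresses and names are illustrative):
 *
 *	db> show vpath 0xfffff80003a4e000
 *	0xfffff80003a4e000: local
 *	0xfffff80003a21000: usr
 *	0xfffff80002f00000: /
 */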
 3952 
 3953 #endif
 3954 
 3955 static int cache_fast_lookup = 1;
 3956 
 3957 #define CACHE_FPL_FAILED        -2020
 3958 
 3959 void
 3960 cache_fast_lookup_enabled_recalc(void)
 3961 {
 3962         int lookup_flag;
 3963         int mac_on;
 3964 
 3965 #ifdef MAC
 3966         mac_on = mac_vnode_check_lookup_enabled();
 3967         mac_on |= mac_vnode_check_readlink_enabled();
 3968 #else
 3969         mac_on = 0;
 3970 #endif
 3971 
 3972         lookup_flag = atomic_load_int(&cache_fast_lookup);
 3973         if (lookup_flag && !mac_on) {
 3974                 atomic_store_char(&cache_fast_lookup_enabled, true);
 3975         } else {
 3976                 atomic_store_char(&cache_fast_lookup_enabled, false);
 3977         }
 3978 }
 3979 
 3980 static int
 3981 sysctl_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
 3982 {
 3983         int error, old;
 3984 
 3985         old = atomic_load_int(&cache_fast_lookup);
 3986         error = sysctl_handle_int(oidp, arg1, arg2, req);
 3987         if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
 3988                 cache_fast_lookup_enabled_recalc();
 3989         return (error);
 3990 }
 3991 SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
 3992     &cache_fast_lookup, 0, sysctl_vfs_cache_fast_lookup, "IU", "");
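
/*
 * The knob is flipped from userspace, e.g.:
 *
 *	# sysctl vfs.cache_fast_lookup=0
 *
 * New values funnel through the handler above, which recalculates
 * cache_fast_lookup_enabled while taking MAC state into account.
 */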
 3993 
 3994 /*
 3995  * Components of nameidata (or objects it can point to) which may
 3996  * need restoring in case fast path lookup fails.
 3997  */
 3998 struct nameidata_outer {
 3999         size_t ni_pathlen;
 4000         int cn_flags;
 4001 };
 4002 
 4003 struct nameidata_saved {
 4004 #ifdef INVARIANTS
 4005         char *cn_nameptr;
 4006         size_t ni_pathlen;
 4007 #endif
 4008 };
 4009 
 4010 #ifdef INVARIANTS
 4011 struct cache_fpl_debug {
 4012         size_t ni_pathlen;
 4013 };
 4014 #endif
 4015 
 4016 struct cache_fpl {
 4017         struct nameidata *ndp;
 4018         struct componentname *cnp;
 4019         char *nulchar;
 4020         struct vnode *dvp;
 4021         struct vnode *tvp;
 4022         seqc_t dvp_seqc;
 4023         seqc_t tvp_seqc;
 4024         uint32_t hash;
 4025         struct nameidata_saved snd;
 4026         struct nameidata_outer snd_outer;
 4027         int line;
 4028         enum cache_fpl_status status:8;
 4029         bool in_smr;
 4030         bool fsearch;
 4031         struct pwd **pwd;
 4032 #ifdef INVARIANTS
 4033         struct cache_fpl_debug debug;
 4034 #endif
 4035 };
 4036 
 4037 static bool cache_fplookup_mp_supported(struct mount *mp);
 4038 static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
 4039 static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
 4040 static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
 4041 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
 4042 static int cache_fplookup_trailingslash(struct cache_fpl *fpl);
 4043 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
 4044 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
 4045 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
 4046 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
 4047 
 4048 static void
 4049 cache_fpl_cleanup_cnp(struct componentname *cnp)
 4050 {
 4051 
 4052         uma_zfree(namei_zone, cnp->cn_pnbuf);
 4053         cnp->cn_pnbuf = NULL;
 4054         cnp->cn_nameptr = NULL;
 4055 }
 4056 
 4057 static struct vnode *
 4058 cache_fpl_handle_root(struct cache_fpl *fpl)
 4059 {
 4060         struct nameidata *ndp;
 4061         struct componentname *cnp;
 4062 
 4063         ndp = fpl->ndp;
 4064         cnp = fpl->cnp;
 4065 
 4066         MPASS(*(cnp->cn_nameptr) == '/');
 4067         cnp->cn_nameptr++;
 4068         cache_fpl_pathlen_dec(fpl);
 4069 
 4070         if (__predict_false(*(cnp->cn_nameptr) == '/')) {
 4071                 do {
 4072                         cnp->cn_nameptr++;
 4073                         cache_fpl_pathlen_dec(fpl);
 4074                 } while (*(cnp->cn_nameptr) == '/');
 4075         }
 4076 
 4077         return (ndp->ni_rootdir);
 4078 }
 4079 
 4080 static void
 4081 cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
 4082 {
 4083 
 4084         fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
 4085         fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
 4086 }
 4087 
 4088 static void
 4089 cache_fpl_checkpoint(struct cache_fpl *fpl)
 4090 {
 4091 
 4092 #ifdef INVARIANTS
 4093         fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
 4094         fpl->snd.ni_pathlen = fpl->debug.ni_pathlen;
 4095 #endif
 4096 }
 4097 
 4098 static void
 4099 cache_fpl_restore_partial(struct cache_fpl *fpl)
 4100 {
 4101 
 4102         fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
 4103 #ifdef INVARIANTS
 4104         fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
 4105 #endif
 4106 }
 4107 
 4108 static void
 4109 cache_fpl_restore_abort(struct cache_fpl *fpl)
 4110 {
 4111 
 4112         cache_fpl_restore_partial(fpl);
 4113         /*
 4114          * It is 0 on entry by API contract.
 4115          */
 4116         fpl->ndp->ni_resflags = 0;
 4117         fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
 4118         fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
 4119 }
 4120 
 4121 #ifdef INVARIANTS
 4122 #define cache_fpl_smr_assert_entered(fpl) ({                    \
 4123         struct cache_fpl *_fpl = (fpl);                         \
 4124         MPASS(_fpl->in_smr == true);                            \
 4125         VFS_SMR_ASSERT_ENTERED();                               \
 4126 })
 4127 #define cache_fpl_smr_assert_not_entered(fpl) ({                \
 4128         struct cache_fpl *_fpl = (fpl);                         \
 4129         MPASS(_fpl->in_smr == false);                           \
 4130         VFS_SMR_ASSERT_NOT_ENTERED();                           \
 4131 })
 4132 static void
 4133 cache_fpl_assert_status(struct cache_fpl *fpl)
 4134 {
 4135 
 4136         switch (fpl->status) {
 4137         case CACHE_FPL_STATUS_UNSET:
 4138                 __assert_unreachable();
 4139                 break;
 4140         case CACHE_FPL_STATUS_DESTROYED:
 4141         case CACHE_FPL_STATUS_ABORTED:
 4142         case CACHE_FPL_STATUS_PARTIAL:
 4143         case CACHE_FPL_STATUS_HANDLED:
 4144                 break;
 4145         }
 4146 }
 4147 #else
 4148 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
 4149 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
 4150 #define cache_fpl_assert_status(fpl) do { } while (0)
 4151 #endif
 4152 
 4153 #define cache_fpl_smr_enter_initial(fpl) ({                     \
 4154         struct cache_fpl *_fpl = (fpl);                         \
 4155         vfs_smr_enter();                                        \
 4156         _fpl->in_smr = true;                                    \
 4157 })
 4158 
 4159 #define cache_fpl_smr_enter(fpl) ({                             \
 4160         struct cache_fpl *_fpl = (fpl);                         \
 4161         MPASS(_fpl->in_smr == false);                           \
 4162         vfs_smr_enter();                                        \
 4163         _fpl->in_smr = true;                                    \
 4164 })
 4165 
 4166 #define cache_fpl_smr_exit(fpl) ({                              \
 4167         struct cache_fpl *_fpl = (fpl);                         \
 4168         MPASS(_fpl->in_smr == true);                            \
 4169         vfs_smr_exit();                                         \
 4170         _fpl->in_smr = false;                                   \
 4171 })
 4172 
 4173 static int
 4174 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
 4175 {
 4176 
 4177         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
 4178                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
 4179                     ("%s: converting to abort from %d at %d, set at %d\n",
 4180                     __func__, fpl->status, line, fpl->line));
 4181         }
 4182         cache_fpl_smr_assert_not_entered(fpl);
 4183         fpl->status = CACHE_FPL_STATUS_ABORTED;
 4184         fpl->line = line;
 4185         return (CACHE_FPL_FAILED);
 4186 }
 4187 
 4188 #define cache_fpl_aborted_early(x)      cache_fpl_aborted_early_impl((x), __LINE__)
 4189 
 4190 static int __noinline
 4191 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
 4192 {
 4193         struct nameidata *ndp;
 4194         struct componentname *cnp;
 4195 
 4196         ndp = fpl->ndp;
 4197         cnp = fpl->cnp;
 4198 
 4199         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
 4200                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
 4201                     ("%s: converting to abort from %d at %d, set at %d\n",
 4202                     __func__, fpl->status, line, fpl->line));
 4203         }
 4204         fpl->status = CACHE_FPL_STATUS_ABORTED;
 4205         fpl->line = line;
 4206         if (fpl->in_smr)
 4207                 cache_fpl_smr_exit(fpl);
 4208         cache_fpl_restore_abort(fpl);
 4209         /*
 4210          * Resolving symlinks overwrites data passed by the caller.
 4211          * Let namei know.
 4212          */
 4213         if (ndp->ni_loopcnt > 0) {
 4214                 fpl->status = CACHE_FPL_STATUS_DESTROYED;
 4215                 cache_fpl_cleanup_cnp(cnp);
 4216         }
 4217         return (CACHE_FPL_FAILED);
 4218 }
 4219 
 4220 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
 4221 
 4222 static int __noinline
 4223 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
 4224 {
 4225 
 4226         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
 4227             ("%s: setting to partial at %d, but already set to %d at %d\n",
 4228             __func__, line, fpl->status, fpl->line));
 4229         cache_fpl_smr_assert_entered(fpl);
 4230         fpl->status = CACHE_FPL_STATUS_PARTIAL;
 4231         fpl->line = line;
 4232         return (cache_fplookup_partial_setup(fpl));
 4233 }
 4234 
 4235 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
 4236 
 4237 static int
 4238 cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
 4239 {
 4240 
 4241         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
 4242             ("%s: setting to handled at %d, but already set to %d at %d\n",
 4243             __func__, line, fpl->status, fpl->line));
 4244         cache_fpl_smr_assert_not_entered(fpl);
 4245         fpl->status = CACHE_FPL_STATUS_HANDLED;
 4246         fpl->line = line;
 4247         return (0);
 4248 }
 4249 
 4250 #define cache_fpl_handled(x)    cache_fpl_handled_impl((x), __LINE__)
 4251 
 4252 static int
 4253 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
 4254 {
 4255 
 4256         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
 4257             ("%s: setting to handled at %d, but already set to %d at %d\n",
 4258             __func__, line, fpl->status, fpl->line));
 4259         MPASS(error != 0);
 4260         MPASS(error != CACHE_FPL_FAILED);
 4261         cache_fpl_smr_assert_not_entered(fpl);
 4262         fpl->status = CACHE_FPL_STATUS_HANDLED;
 4263         fpl->line = line;
 4264         fpl->dvp = NULL;
 4265         fpl->tvp = NULL;
 4266         return (error);
 4267 }
 4268 
 4269 #define cache_fpl_handled_error(x, e)   cache_fpl_handled_error_impl((x), (e), __LINE__)
 4270 
 4271 static bool
 4272 cache_fpl_terminated(struct cache_fpl *fpl)
 4273 {
 4274 
 4275         return (fpl->status != CACHE_FPL_STATUS_UNSET);
 4276 }
 4277 
 4278 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
 4279         (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
 4280          FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | WILLBEDIR | \
 4281          ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \
 4282          OPENWRITE | WANTIOCTLCAPS)
 4283 
 4284 #define CACHE_FPL_INTERNAL_CN_FLAGS \
 4285         (ISDOTDOT | MAKEENTRY | ISLASTCN)
 4286 
 4287 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
 4288     "supported and internal flags overlap");
 4289 
 4290 static bool
 4291 cache_fpl_islastcn(struct nameidata *ndp)
 4292 {
 4293 
 4294         return (*ndp->ni_next == 0);
 4295 }
 4296 
 4297 static bool
 4298 cache_fpl_istrailingslash(struct cache_fpl *fpl)
 4299 {
 4300 
 4301         MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf);
 4302         return (*(fpl->nulchar - 1) == '/');
 4303 }
 4304 
 4305 static bool
 4306 cache_fpl_isdotdot(struct componentname *cnp)
 4307 {
 4308 
 4309         if (cnp->cn_namelen == 2 &&
 4310             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
 4311                 return (true);
 4312         return (false);
 4313 }
 4314 
 4315 static bool
 4316 cache_can_fplookup(struct cache_fpl *fpl)
 4317 {
 4318         struct nameidata *ndp;
 4319         struct componentname *cnp;
 4320         struct thread *td;
 4321 
 4322         ndp = fpl->ndp;
 4323         cnp = fpl->cnp;
 4324         td = curthread;
 4325 
 4326         if (!atomic_load_char(&cache_fast_lookup_enabled)) {
 4327                 cache_fpl_aborted_early(fpl);
 4328                 return (false);
 4329         }
 4330         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
 4331                 cache_fpl_aborted_early(fpl);
 4332                 return (false);
 4333         }
 4334         if (IN_CAPABILITY_MODE(td)) {
 4335                 cache_fpl_aborted_early(fpl);
 4336                 return (false);
 4337         }
 4338         if (AUDITING_TD(td)) {
 4339                 cache_fpl_aborted_early(fpl);
 4340                 return (false);
 4341         }
 4342         if (ndp->ni_startdir != NULL) {
 4343                 cache_fpl_aborted_early(fpl);
 4344                 return (false);
 4345         }
 4346         return (true);
 4347 }
 4348 
 4349 static int __noinline
 4350 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
 4351 {
 4352         struct nameidata *ndp;
 4353         struct componentname *cnp;
 4354         int error;
 4355         bool fsearch;
 4356 
 4357         ndp = fpl->ndp;
 4358         cnp = fpl->cnp;
 4359 
 4360         error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
 4361         if (__predict_false(error != 0)) {
 4362                 return (cache_fpl_aborted(fpl));
 4363         }
 4364         fpl->fsearch = fsearch;
 4365         if ((*vpp)->v_type != VDIR) {
 4366                 if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) {
 4367                         cache_fpl_smr_exit(fpl);
 4368                         return (cache_fpl_handled_error(fpl, ENOTDIR));
 4369                 }
 4370         }
 4371         return (0);
 4372 }
 4373 
 4374 static int __noinline
 4375 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
 4376     uint32_t hash)
 4377 {
 4378         struct componentname *cnp;
 4379         struct vnode *dvp;
 4380 
 4381         cnp = fpl->cnp;
 4382         dvp = fpl->dvp;
 4383 
 4384         cache_fpl_smr_exit(fpl);
 4385         if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
 4386                 return (cache_fpl_handled_error(fpl, ENOENT));
 4387         else
 4388                 return (cache_fpl_aborted(fpl));
 4389 }
 4390 
 4391 /*
 4392  * The target vnode is not supported; prepare for the slow path to take over.
 4393  */
 4394 static int __noinline
 4395 cache_fplookup_partial_setup(struct cache_fpl *fpl)
 4396 {
 4397         struct nameidata *ndp;
 4398         struct componentname *cnp;
 4399         enum vgetstate dvs;
 4400         struct vnode *dvp;
 4401         struct pwd *pwd;
 4402         seqc_t dvp_seqc;
 4403 
 4404         ndp = fpl->ndp;
 4405         cnp = fpl->cnp;
 4406         pwd = *(fpl->pwd);
 4407         dvp = fpl->dvp;
 4408         dvp_seqc = fpl->dvp_seqc;
 4409 
 4410         if (!pwd_hold_smr(pwd)) {
 4411                 return (cache_fpl_aborted(fpl));
 4412         }
 4413 
 4414         /*
 4415          * Note that seqc is checked before the vnode is locked, so by
 4416          * the time regular lookup gets to it it may have moved.
 4417          *
 4418          * Ultimately this does not affect correctness; any lookup errors
 4419          * are userspace racing with itself. It is guaranteed that any
 4420          * path which ultimately gets found could also have been found
 4421          * by regular lookup going all the way in absence of concurrent
 4422          * modifications.
 4423          */
 4424         dvs = vget_prep_smr(dvp);
 4425         cache_fpl_smr_exit(fpl);
 4426         if (__predict_false(dvs == VGET_NONE)) {
 4427                 pwd_drop(pwd);
 4428                 return (cache_fpl_aborted(fpl));
 4429         }
 4430 
 4431         vget_finish_ref(dvp, dvs);
 4432         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 4433                 vrele(dvp);
 4434                 pwd_drop(pwd);
 4435                 return (cache_fpl_aborted(fpl));
 4436         }
 4437 
 4438         cache_fpl_restore_partial(fpl);
 4439 #ifdef INVARIANTS
 4440         if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
 4441                 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
 4442                     cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
 4443         }
 4444 #endif
 4445 
 4446         ndp->ni_startdir = dvp;
 4447         cnp->cn_flags |= MAKEENTRY;
 4448         if (cache_fpl_islastcn(ndp))
 4449                 cnp->cn_flags |= ISLASTCN;
 4450         if (cache_fpl_isdotdot(cnp))
 4451                 cnp->cn_flags |= ISDOTDOT;
 4452 
 4453         /*
 4454          * Skip potential extra slashes parsing did not take care of.
 4455          * cache_fplookup_skip_slashes explains the mechanism.
 4456          */
 4457         if (__predict_false(*(cnp->cn_nameptr) == '/')) {
 4458                 do {
 4459                         cnp->cn_nameptr++;
 4460                         cache_fpl_pathlen_dec(fpl);
 4461                 } while (*(cnp->cn_nameptr) == '/');
 4462         }
 4463 
 4464         ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
 4465 #ifdef INVARIANTS
 4466         if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
 4467                 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
 4468                     __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
 4469                     cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
 4470         }
 4471 #endif
 4472         return (0);
 4473 }
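
/*
 * The vget_prep_smr + vn_seqc_consistent dance above is the central idiom
 * of lockless lookup: prepare a reference while still inside the SMR
 * section, leave the section, then verify the vnode did not change in the
 * meantime.  A minimal sketch of the idiom follows; the helper is
 * hypothetical (not part of the kernel API) and assumes the caller entered
 * the SMR section with vfs_smr_enter.
 */
static int __unused
vnode_ref_validate_sketch(struct vnode *vp, seqc_t vp_seqc)
{
        enum vgetstate vs;

        vs = vget_prep_smr(vp);         /* fails if the vnode is being reclaimed */
        vfs_smr_exit();
        if (vs == VGET_NONE)
                return (EAGAIN);
        vget_finish_ref(vp, vs);        /* upgrade to a real reference */
        if (!vn_seqc_consistent(vp, vp_seqc)) {
                /* Something modified the vnode; the caller must fall back. */
                vrele(vp);
                return (EAGAIN);
        }
        return (0);
}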
 4474 
 4475 static int
 4476 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
 4477 {
 4478         struct componentname *cnp;
 4479         struct vnode *tvp;
 4480         seqc_t tvp_seqc;
 4481         int error, lkflags;
 4482 
 4483         cnp = fpl->cnp;
 4484         tvp = fpl->tvp;
 4485         tvp_seqc = fpl->tvp_seqc;
 4486 
 4487         if ((cnp->cn_flags & LOCKLEAF) != 0) {
 4488                 lkflags = LK_SHARED;
 4489                 if ((cnp->cn_flags & LOCKSHARED) == 0)
 4490                         lkflags = LK_EXCLUSIVE;
 4491                 error = vget_finish(tvp, lkflags, tvs);
 4492                 if (__predict_false(error != 0)) {
 4493                         return (cache_fpl_aborted(fpl));
 4494                 }
 4495         } else {
 4496                 vget_finish_ref(tvp, tvs);
 4497         }
 4498 
 4499         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
 4500                 if ((cnp->cn_flags & LOCKLEAF) != 0)
 4501                         vput(tvp);
 4502                 else
 4503                         vrele(tvp);
 4504                 return (cache_fpl_aborted(fpl));
 4505         }
 4506 
 4507         return (cache_fpl_handled(fpl));
 4508 }
 4509 
 4510 /*
 4511  * They want to possibly modify the state of the namecache.
 4512  */
 4513 static int __noinline
 4514 cache_fplookup_final_modifying(struct cache_fpl *fpl)
 4515 {
 4516         struct nameidata *ndp __diagused;
 4517         struct componentname *cnp;
 4518         enum vgetstate dvs;
 4519         struct vnode *dvp, *tvp;
 4520         struct mount *mp;
 4521         seqc_t dvp_seqc;
 4522         int error;
 4523         bool docache;
 4524 
 4525         ndp = fpl->ndp;
 4526         cnp = fpl->cnp;
 4527         dvp = fpl->dvp;
 4528         dvp_seqc = fpl->dvp_seqc;
 4529 
 4530         MPASS(*(cnp->cn_nameptr) != '/');
 4531         MPASS(cache_fpl_islastcn(ndp));
 4532         if ((cnp->cn_flags & LOCKPARENT) == 0)
 4533                 MPASS((cnp->cn_flags & WANTPARENT) != 0);
 4534         MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
 4535         MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
 4536             cnp->cn_nameiop == RENAME);
 4537         MPASS((cnp->cn_flags & MAKEENTRY) == 0);
 4538         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
 4539 
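        /*
         * Note: the XOR below evaluates to non-zero (docache == true)
         * precisely when NOCACHE is not set.
         */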
 4540         docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
 4541         if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
 4542                 docache = false;
 4543 
 4544         /*
 4545          * Regular lookup nullifies the slash, which we don't do here.
 4546          * Don't take chances with filesystem routines seeing it for
 4547          * the last entry.
 4548          */
 4549         if (cache_fpl_istrailingslash(fpl)) {
 4550                 return (cache_fpl_partial(fpl));
 4551         }
 4552 
 4553         mp = atomic_load_ptr(&dvp->v_mount);
 4554         if (__predict_false(mp == NULL)) {
 4555                 return (cache_fpl_aborted(fpl));
 4556         }
 4557 
 4558         if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
 4559                 cache_fpl_smr_exit(fpl);
 4560                 /*
 4561                  * The original code does not check for CREATE, which
 4562                  * might be a bug. For now let the old lookup decide.
 4563                  */
 4564                 if (cnp->cn_nameiop == CREATE) {
 4565                         return (cache_fpl_aborted(fpl));
 4566                 }
 4567                 return (cache_fpl_handled_error(fpl, EROFS));
 4568         }
 4569 
 4570         if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
 4571                 cache_fpl_smr_exit(fpl);
 4572                 return (cache_fpl_handled_error(fpl, EEXIST));
 4573         }
 4574 
 4575         /*
 4576          * Secure access to dvp; check cache_fplookup_partial_setup for
 4577          * reasoning.
 4578          *
 4579          * XXX At least UFS requires its lookup routine to be called for
 4580          * the last path component, which leads to some level of complication
 4581          * and inefficiency:
 4582          * - the target routine always locks the target vnode, but our caller
 4583          *   may not need it locked
 4584          * - some of the VOP machinery asserts that the parent is locked, which
 4585          *   once more may not be required
 4586          *
 4587          * TODO: add a flag for filesystems which don't need this.
 4588          */
 4589         dvs = vget_prep_smr(dvp);
 4590         cache_fpl_smr_exit(fpl);
 4591         if (__predict_false(dvs == VGET_NONE)) {
 4592                 return (cache_fpl_aborted(fpl));
 4593         }
 4594 
 4595         vget_finish_ref(dvp, dvs);
 4596         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 4597                 vrele(dvp);
 4598                 return (cache_fpl_aborted(fpl));
 4599         }
 4600 
 4601         error = vn_lock(dvp, LK_EXCLUSIVE);
 4602         if (__predict_false(error != 0)) {
 4603                 vrele(dvp);
 4604                 return (cache_fpl_aborted(fpl));
 4605         }
 4606 
 4607         tvp = NULL;
 4608         cnp->cn_flags |= ISLASTCN;
 4609         if (docache)
 4610                 cnp->cn_flags |= MAKEENTRY;
 4611         if (cache_fpl_isdotdot(cnp))
 4612                 cnp->cn_flags |= ISDOTDOT;
 4613         cnp->cn_lkflags = LK_EXCLUSIVE;
 4614         error = VOP_LOOKUP(dvp, &tvp, cnp);
 4615         switch (error) {
 4616         case EJUSTRETURN:
 4617         case 0:
 4618                 break;
 4619         case ENOTDIR:
 4620         case ENOENT:
 4621                 vput(dvp);
 4622                 return (cache_fpl_handled_error(fpl, error));
 4623         default:
 4624                 vput(dvp);
 4625                 return (cache_fpl_aborted(fpl));
 4626         }
 4627 
 4628         fpl->tvp = tvp;
 4629 
 4630         if (tvp == NULL) {
 4631                 MPASS(error == EJUSTRETURN);
 4632                 if ((cnp->cn_flags & LOCKPARENT) == 0) {
 4633                         VOP_UNLOCK(dvp);
 4634                 }
 4635                 return (cache_fpl_handled(fpl));
 4636         }
 4637 
 4638         /*
 4639          * There are very hairy corner cases concerning various flag combinations
 4640          * and locking state. In particular here we only hold one lock instead of
 4641          * two.
 4642          *
 4643          * Skip the complexity as it is of no significance for normal workloads.
 4644          */
 4645         if (__predict_false(tvp == dvp)) {
 4646                 vput(dvp);
 4647                 vrele(tvp);
 4648                 return (cache_fpl_aborted(fpl));
 4649         }
 4650 
 4651         /*
 4652          * If they want the symlink itself we are fine, but if they want to
 4653          * follow it, regular lookup has to be engaged.
 4654          */
 4655         if (tvp->v_type == VLNK) {
 4656                 if ((cnp->cn_flags & FOLLOW) != 0) {
 4657                         vput(dvp);
 4658                         vput(tvp);
 4659                         return (cache_fpl_aborted(fpl));
 4660                 }
 4661         }
 4662 
 4663         /*
 4664          * Since we expect this to be the terminal vnode it should almost never
 4665          * be a mount point.
 4666          */
 4667         if (__predict_false(cache_fplookup_is_mp(fpl))) {
 4668                 vput(dvp);
 4669                 vput(tvp);
 4670                 return (cache_fpl_aborted(fpl));
 4671         }
 4672 
 4673         if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
 4674                 vput(dvp);
 4675                 vput(tvp);
 4676                 return (cache_fpl_handled_error(fpl, EEXIST));
 4677         }
 4678 
 4679         if ((cnp->cn_flags & LOCKLEAF) == 0) {
 4680                 VOP_UNLOCK(tvp);
 4681         }
 4682 
 4683         if ((cnp->cn_flags & LOCKPARENT) == 0) {
 4684                 VOP_UNLOCK(dvp);
 4685         }
 4686 
 4687         return (cache_fpl_handled(fpl));
 4688 }
 4689 
 4690 static int __noinline
 4691 cache_fplookup_modifying(struct cache_fpl *fpl)
 4692 {
 4693         struct nameidata *ndp;
 4694 
 4695         ndp = fpl->ndp;
 4696 
 4697         if (!cache_fpl_islastcn(ndp)) {
 4698                 return (cache_fpl_partial(fpl));
 4699         }
 4700         return (cache_fplookup_final_modifying(fpl));
 4701 }
 4702 
 4703 static int __noinline
 4704 cache_fplookup_final_withparent(struct cache_fpl *fpl)
 4705 {
 4706         struct componentname *cnp;
 4707         enum vgetstate dvs, tvs;
 4708         struct vnode *dvp, *tvp;
 4709         seqc_t dvp_seqc;
 4710         int error;
 4711 
 4712         cnp = fpl->cnp;
 4713         dvp = fpl->dvp;
 4714         dvp_seqc = fpl->dvp_seqc;
 4715         tvp = fpl->tvp;
 4716 
 4717         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
 4718 
 4719         /*
 4720          * For simplicity this is less efficient than it could be.
 4721          */
 4722         dvs = vget_prep_smr(dvp);
 4723         if (__predict_false(dvs == VGET_NONE)) {
 4724                 return (cache_fpl_aborted(fpl));
 4725         }
 4726         tvs = vget_prep_smr(tvp);
 4727         if (__predict_false(tvs == VGET_NONE)) {
 4728                 cache_fpl_smr_exit(fpl);
 4729                 vget_abort(dvp, dvs);
 4730                 return (cache_fpl_aborted(fpl));
 4731         }
 4732 
 4733         cache_fpl_smr_exit(fpl);
 4734 
 4735         if ((cnp->cn_flags & LOCKPARENT) != 0) {
 4736                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
 4737                 if (__predict_false(error != 0)) {
 4738                         vget_abort(tvp, tvs);
 4739                         return (cache_fpl_aborted(fpl));
 4740                 }
 4741         } else {
 4742                 vget_finish_ref(dvp, dvs);
 4743         }
 4744 
 4745         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 4746                 vget_abort(tvp, tvs);
 4747                 if ((cnp->cn_flags & LOCKPARENT) != 0)
 4748                         vput(dvp);
 4749                 else
 4750                         vrele(dvp);
 4751                 return (cache_fpl_aborted(fpl));
 4752         }
 4753 
 4754         error = cache_fplookup_final_child(fpl, tvs);
 4755         if (__predict_false(error != 0)) {
 4756                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED ||
 4757                     fpl->status == CACHE_FPL_STATUS_DESTROYED);
 4758                 if ((cnp->cn_flags & LOCKPARENT) != 0)
 4759                         vput(dvp);
 4760                 else
 4761                         vrele(dvp);
 4762                 return (error);
 4763         }
 4764 
 4765         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
 4766         return (0);
 4767 }
 4768 
 4769 static int
 4770 cache_fplookup_final(struct cache_fpl *fpl)
 4771 {
 4772         struct componentname *cnp;
 4773         enum vgetstate tvs;
 4774         struct vnode *dvp, *tvp;
 4775         seqc_t dvp_seqc;
 4776 
 4777         cnp = fpl->cnp;
 4778         dvp = fpl->dvp;
 4779         dvp_seqc = fpl->dvp_seqc;
 4780         tvp = fpl->tvp;
 4781 
 4782         MPASS(*(cnp->cn_nameptr) != '/');
 4783 
 4784         if (cnp->cn_nameiop != LOOKUP) {
 4785                 return (cache_fplookup_final_modifying(fpl));
 4786         }
 4787 
 4788         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
 4789                 return (cache_fplookup_final_withparent(fpl));
 4790 
 4791         tvs = vget_prep_smr(tvp);
 4792         if (__predict_false(tvs == VGET_NONE)) {
 4793                 return (cache_fpl_partial(fpl));
 4794         }
 4795 
 4796         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 4797                 cache_fpl_smr_exit(fpl);
 4798                 vget_abort(tvp, tvs);
 4799                 return (cache_fpl_aborted(fpl));
 4800         }
 4801 
 4802         cache_fpl_smr_exit(fpl);
 4803         return (cache_fplookup_final_child(fpl, tvs));
 4804 }
 4805 
 4806 /*
 4807  * Comment from locked lookup:
 4808  * Check for a degenerate name (e.g., "/" or ""), which is a way of talking
 4809  * about a directory, e.g., "/." or ".".
 4810  */
 4811 static int __noinline
 4812 cache_fplookup_degenerate(struct cache_fpl *fpl)
 4813 {
 4814         struct componentname *cnp;
 4815         struct vnode *dvp;
 4816         enum vgetstate dvs;
 4817         int error, lkflags;
 4818 #ifdef INVARIANTS
 4819         char *cp;
 4820 #endif
 4821 
 4822         fpl->tvp = fpl->dvp;
 4823         fpl->tvp_seqc = fpl->dvp_seqc;
 4824 
 4825         cnp = fpl->cnp;
 4826         dvp = fpl->dvp;
 4827 
 4828 #ifdef INVARIANTS
 4829         for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) {
 4830                 KASSERT(*cp == '/',
 4831                     ("%s: encountered non-slash; string [%s]\n", __func__,
 4832                     cnp->cn_pnbuf));
 4833         }
 4834 #endif
 4835 
 4836         if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
 4837                 cache_fpl_smr_exit(fpl);
 4838                 return (cache_fpl_handled_error(fpl, EISDIR));
 4839         }
 4840 
 4841         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
 4842                 return (cache_fplookup_final_withparent(fpl));
 4843         }
 4844 
 4845         dvs = vget_prep_smr(dvp);
 4846         cache_fpl_smr_exit(fpl);
 4847         if (__predict_false(dvs == VGET_NONE)) {
 4848                 return (cache_fpl_aborted(fpl));
 4849         }
 4850 
 4851         if ((cnp->cn_flags & LOCKLEAF) != 0) {
 4852                 lkflags = LK_SHARED;
 4853                 if ((cnp->cn_flags & LOCKSHARED) == 0)
 4854                         lkflags = LK_EXCLUSIVE;
 4855                 error = vget_finish(dvp, lkflags, dvs);
 4856                 if (__predict_false(error != 0)) {
 4857                         return (cache_fpl_aborted(fpl));
 4858                 }
 4859         } else {
 4860                 vget_finish_ref(dvp, dvs);
 4861         }
 4862         return (cache_fpl_handled(fpl));
 4863 }
 4864 
 4865 static int __noinline
 4866 cache_fplookup_emptypath(struct cache_fpl *fpl)
 4867 {
 4868         struct nameidata *ndp;
 4869         struct componentname *cnp;
 4870         enum vgetstate tvs;
 4871         struct vnode *tvp;
 4872         int error, lkflags;
 4873 
 4874         fpl->tvp = fpl->dvp;
 4875         fpl->tvp_seqc = fpl->dvp_seqc;
 4876 
 4877         ndp = fpl->ndp;
 4878         cnp = fpl->cnp;
 4879         tvp = fpl->tvp;
 4880 
 4881         MPASS(*cnp->cn_pnbuf == '\0');
 4882 
 4883         if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) {
 4884                 cache_fpl_smr_exit(fpl);
 4885                 return (cache_fpl_handled_error(fpl, ENOENT));
 4886         }
 4887 
 4888         MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0);
 4889 
 4890         tvs = vget_prep_smr(tvp);
 4891         cache_fpl_smr_exit(fpl);
 4892         if (__predict_false(tvs == VGET_NONE)) {
 4893                 return (cache_fpl_aborted(fpl));
 4894         }
 4895 
 4896         if ((cnp->cn_flags & LOCKLEAF) != 0) {
 4897                 lkflags = LK_SHARED;
 4898                 if ((cnp->cn_flags & LOCKSHARED) == 0)
 4899                         lkflags = LK_EXCLUSIVE;
 4900                 error = vget_finish(tvp, lkflags, tvs);
 4901                 if (__predict_false(error != 0)) {
 4902                         return (cache_fpl_aborted(fpl));
 4903                 }
 4904         } else {
 4905                 vget_finish_ref(tvp, tvs);
 4906         }
 4907 
 4908         ndp->ni_resflags |= NIRES_EMPTYPATH;
 4909         return (cache_fpl_handled(fpl));
 4910 }
 4911 
 4912 static int __noinline
 4913 cache_fplookup_noentry(struct cache_fpl *fpl)
 4914 {
 4915         struct nameidata *ndp;
 4916         struct componentname *cnp;
 4917         enum vgetstate dvs;
 4918         struct vnode *dvp, *tvp;
 4919         seqc_t dvp_seqc;
 4920         int error;
 4921 
 4922         ndp = fpl->ndp;
 4923         cnp = fpl->cnp;
 4924         dvp = fpl->dvp;
 4925         dvp_seqc = fpl->dvp_seqc;
 4926 
 4927         MPASS((cnp->cn_flags & MAKEENTRY) == 0);
 4928         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
 4929         if (cnp->cn_nameiop == LOOKUP)
 4930                 MPASS((cnp->cn_flags & NOCACHE) == 0);
 4931         MPASS(!cache_fpl_isdotdot(cnp));
 4932 
 4933         /*
 4934          * Hack: delayed name len checking.
 4935          */
 4936         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
 4937                 cache_fpl_smr_exit(fpl);
 4938                 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
 4939         }
 4940 
 4941         if (cnp->cn_nameptr[0] == '/') {
 4942                 return (cache_fplookup_skip_slashes(fpl));
 4943         }
 4944 
 4945         if (cnp->cn_pnbuf[0] == '\0') {
 4946                 return (cache_fplookup_emptypath(fpl));
 4947         }
 4948 
 4949         if (cnp->cn_nameptr[0] == '\0') {
 4950                 if (fpl->tvp == NULL) {
 4951                         return (cache_fplookup_degenerate(fpl));
 4952                 }
 4953                 return (cache_fplookup_trailingslash(fpl));
 4954         }
 4955 
 4956         if (cnp->cn_nameiop != LOOKUP) {
 4957                 fpl->tvp = NULL;
 4958                 return (cache_fplookup_modifying(fpl));
 4959         }
 4960 
 4961         /*
 4962          * Only try to fill in the component if it is the last one,
 4963          * otherwise not only may there be several to handle, but the
 4964          * walk may be complicated.
 4965          */
 4966         if (!cache_fpl_islastcn(ndp)) {
 4967                 return (cache_fpl_partial(fpl));
 4968         }
 4969 
 4970         /*
 4971          * Regular lookup nullifies the slash, which we don't do here.
 4972          * Don't take chances with filesystem routines seeing it for
 4973          * the last entry.
 4974          */
 4975         if (cache_fpl_istrailingslash(fpl)) {
 4976                 return (cache_fpl_partial(fpl));
 4977         }
 4978 
 4979         /*
 4980          * Secure access to dvp; check cache_fplookup_partial_setup for
 4981          * reasoning.
 4982          */
 4983         dvs = vget_prep_smr(dvp);
 4984         cache_fpl_smr_exit(fpl);
 4985         if (__predict_false(dvs == VGET_NONE)) {
 4986                 return (cache_fpl_aborted(fpl));
 4987         }
 4988 
 4989         vget_finish_ref(dvp, dvs);
 4990         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 4991                 vrele(dvp);
 4992                 return (cache_fpl_aborted(fpl));
 4993         }
 4994 
 4995         error = vn_lock(dvp, LK_SHARED);
 4996         if (__predict_false(error != 0)) {
 4997                 vrele(dvp);
 4998                 return (cache_fpl_aborted(fpl));
 4999         }
 5000 
 5001         tvp = NULL;
 5002         /*
 5003          * TODO: provide variants which don't require locking either vnode.
 5004          */
 5005         cnp->cn_flags |= ISLASTCN | MAKEENTRY;
 5006         cnp->cn_lkflags = LK_SHARED;
 5007         if ((cnp->cn_flags & LOCKSHARED) == 0) {
 5008                 cnp->cn_lkflags = LK_EXCLUSIVE;
 5009         }
 5010         error = VOP_LOOKUP(dvp, &tvp, cnp);
 5011         switch (error) {
 5012         case EJUSTRETURN:
 5013         case 0:
 5014                 break;
 5015         case ENOTDIR:
 5016         case ENOENT:
 5017                 vput(dvp);
 5018                 return (cache_fpl_handled_error(fpl, error));
 5019         default:
 5020                 vput(dvp);
 5021                 return (cache_fpl_aborted(fpl));
 5022         }
 5023 
 5024         fpl->tvp = tvp;
 5025 
 5026         if (tvp == NULL) {
 5027                 MPASS(error == EJUSTRETURN);
 5028                 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
 5029                         vput(dvp);
 5030                 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
 5031                         VOP_UNLOCK(dvp);
 5032                 }
 5033                 return (cache_fpl_handled(fpl));
 5034         }
 5035 
 5036         if (tvp->v_type == VLNK) {
 5037                 if ((cnp->cn_flags & FOLLOW) != 0) {
 5038                         vput(dvp);
 5039                         vput(tvp);
 5040                         return (cache_fpl_aborted(fpl));
 5041                 }
 5042         }
 5043 
 5044         if (__predict_false(cache_fplookup_is_mp(fpl))) {
 5045                 vput(dvp);
 5046                 vput(tvp);
 5047                 return (cache_fpl_aborted(fpl));
 5048         }
 5049 
 5050         if ((cnp->cn_flags & LOCKLEAF) == 0) {
 5051                 VOP_UNLOCK(tvp);
 5052         }
 5053 
 5054         if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
 5055                 vput(dvp);
 5056         } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
 5057                 VOP_UNLOCK(dvp);
 5058         }
 5059         return (cache_fpl_handled(fpl));
 5060 }
 5061 
 5062 static int __noinline
 5063 cache_fplookup_dot(struct cache_fpl *fpl)
 5064 {
 5065         int error;
 5066 
 5067         MPASS(!seqc_in_modify(fpl->dvp_seqc));
 5068         /*
 5069          * Just re-assign the value. seqc will be checked later for the first
 5070          * non-dot path component in line and/or before deciding to return the
 5071          * vnode.
 5072          */
 5073         fpl->tvp = fpl->dvp;
 5074         fpl->tvp_seqc = fpl->dvp_seqc;
 5075 
 5076         counter_u64_add(dothits, 1);
 5077         SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
 5078 
 5079         error = 0;
 5080         if (cache_fplookup_is_mp(fpl)) {
 5081                 error = cache_fplookup_cross_mount(fpl);
 5082         }
 5083         return (error);
 5084 }
 5085 
 5086 static int __noinline
 5087 cache_fplookup_dotdot(struct cache_fpl *fpl)
 5088 {
 5089         struct nameidata *ndp;
 5090         struct componentname *cnp;
 5091         struct namecache *ncp;
 5092         struct vnode *dvp;
 5093         struct prison *pr;
 5094         u_char nc_flag;
 5095 
 5096         ndp = fpl->ndp;
 5097         cnp = fpl->cnp;
 5098         dvp = fpl->dvp;
 5099 
 5100         MPASS(cache_fpl_isdotdot(cnp));
 5101 
 5102         /*
 5103          * XXX this is racy the same way regular lookup is
 5104          */
 5105         for (pr = cnp->cn_cred->cr_prison; pr != NULL;
 5106             pr = pr->pr_parent)
 5107                 if (dvp == pr->pr_root)
 5108                         break;
 5109 
 5110         if (dvp == ndp->ni_rootdir ||
 5111             dvp == ndp->ni_topdir ||
 5112             dvp == rootvnode ||
 5113             pr != NULL) {
 5114                 fpl->tvp = dvp;
 5115                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
 5116                 if (seqc_in_modify(fpl->tvp_seqc)) {
 5117                         return (cache_fpl_aborted(fpl));
 5118                 }
 5119                 return (0);
 5120         }
 5121 
 5122         if ((dvp->v_vflag & VV_ROOT) != 0) {
 5123                 /*
 5124                  * TODO
 5125                  * The opposite of climb mount is needed here.
 5126                  */
 5127                 return (cache_fpl_partial(fpl));
 5128         }
 5129 
 5130         ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
 5131         if (ncp == NULL) {
 5132                 return (cache_fpl_aborted(fpl));
 5133         }
 5134 
 5135         nc_flag = atomic_load_char(&ncp->nc_flag);
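        /*
         * v_cache_dd either points at the ".." entry for this directory
         * (NCF_ISDOTDOT set, parent cached in nc_vp) or at the regular entry
         * linking the parent to this directory (parent is nc_dvp).
         */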
 5136         if ((nc_flag & NCF_ISDOTDOT) != 0) {
 5137                 if ((nc_flag & NCF_NEGATIVE) != 0)
 5138                         return (cache_fpl_aborted(fpl));
 5139                 fpl->tvp = ncp->nc_vp;
 5140         } else {
 5141                 fpl->tvp = ncp->nc_dvp;
 5142         }
 5143 
 5144         fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
 5145         if (seqc_in_modify(fpl->tvp_seqc)) {
 5146                 return (cache_fpl_partial(fpl));
 5147         }
 5148 
 5149         /*
 5150          * The re-check below relies on the acquire fence provided by vn_seqc_read_any above.
 5151          */
 5152         if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
 5153                 return (cache_fpl_aborted(fpl));
 5154         }
 5155 
 5156         if (!cache_ncp_canuse(ncp)) {
 5157                 return (cache_fpl_aborted(fpl));
 5158         }
 5159 
 5160         counter_u64_add(dotdothits, 1);
 5161         return (0);
 5162 }
 5163 
 5164 static int __noinline
 5165 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
 5166 {
 5167         u_char nc_flag __diagused;
 5168         bool neg_promote;
 5169 
 5170 #ifdef INVARIANTS
 5171         nc_flag = atomic_load_char(&ncp->nc_flag);
 5172         MPASS((nc_flag & NCF_NEGATIVE) != 0);
 5173 #endif
 5174         /*
 5175          * If they want to create an entry we need to replace this one.
 5176          */
 5177         if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
 5178                 fpl->tvp = NULL;
 5179                 return (cache_fplookup_modifying(fpl));
 5180         }
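        /*
         * The calls below form a protocol: cache_neg_hit_prep registers the
         * hit and reports whether the entry qualifies for promotion to the
         * hot list, cache_fpl_neg_ncp_canuse re-validates the entry under
         * SMR, and the abort/finish variants back out or commit the hit
         * accounting accordingly.
         */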
 5181         neg_promote = cache_neg_hit_prep(ncp);
 5182         if (!cache_fpl_neg_ncp_canuse(ncp)) {
 5183                 cache_neg_hit_abort(ncp);
 5184                 return (cache_fpl_partial(fpl));
 5185         }
 5186         if (neg_promote) {
 5187                 return (cache_fplookup_negative_promote(fpl, ncp, hash));
 5188         }
 5189         cache_neg_hit_finish(ncp);
 5190         cache_fpl_smr_exit(fpl);
 5191         return (cache_fpl_handled_error(fpl, ENOENT));
 5192 }
 5193 
 5194 /*
 5195  * Resolve a symlink. Called by filesystem-specific routines.
 5196  *
 5197  * Code flow is:
 5198  * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
 5199  */
 5200 int
 5201 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
 5202 {
 5203         struct nameidata *ndp;
 5204         struct componentname *cnp;
 5205         size_t adjust;
 5206 
 5207         ndp = fpl->ndp;
 5208         cnp = fpl->cnp;
 5209 
 5210         if (__predict_false(len == 0)) {
 5211                 return (ENOENT);
 5212         }
 5213 
 5214         if (__predict_false(len > MAXPATHLEN - 2)) {
 5215                 if (cache_fpl_istrailingslash(fpl)) {
 5216                         return (EAGAIN);
 5217                 }
 5218         }
 5219 
 5220         ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
 5221 #ifdef INVARIANTS
 5222         if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
 5223                 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
 5224                     __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
 5225                     cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
 5226         }
 5227 #endif
 5228 
 5229         if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
 5230                 return (ENAMETOOLONG);
 5231         }
 5232 
 5233         if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
 5234                 return (ELOOP);
 5235         }
 5236 
 5237         adjust = len;
 5238         if (ndp->ni_pathlen > 1) {
 5239                 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
 5240         } else {
 5241                 if (cache_fpl_istrailingslash(fpl)) {
 5242                         adjust = len + 1;
 5243                         cnp->cn_pnbuf[len] = '/';
 5244                         cnp->cn_pnbuf[len + 1] = '\0';
 5245                 } else {
 5246                         cnp->cn_pnbuf[len] = '\0';
 5247                 }
 5248         }
 5249         bcopy(string, cnp->cn_pnbuf, len);
 5250 
 5251         ndp->ni_pathlen += adjust;
 5252         cache_fpl_pathlen_add(fpl, adjust);
 5253         cnp->cn_nameptr = cnp->cn_pnbuf;
 5254         fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
 5255         fpl->tvp = NULL;
 5256         return (0);
 5257 }
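
/*
 * A minimal sketch of the filesystem side, assuming a filesystem which
 * embeds short symlink targets in its inode; the vnode operation merely
 * hands the target string to cache_symlink_resolve.  The inode layout
 * below is made up for illustration.  Anything which cannot be read
 * without blocking must return EAGAIN to punt to the locked path.
 */
struct sketchfs_inode {
        size_t  si_len;                 /* length of the symlink target */
        char    si_target[64];          /* inline symlink target */
};

static int __unused
sketchfs_fplookup_symlink(struct vop_fplookup_symlink_args *ap)
{
        struct sketchfs_inode *ip;

        ip = ap->a_vp->v_data;
        if (ip->si_len > sizeof(ip->si_target))
                return (EAGAIN);        /* not inline; take the slow path */
        return (cache_symlink_resolve(ap->a_fpl, ip->si_target, ip->si_len));
}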
 5258 
 5259 static int __noinline
 5260 cache_fplookup_symlink(struct cache_fpl *fpl)
 5261 {
 5262         struct mount *mp;
 5263         struct nameidata *ndp;
 5264         struct componentname *cnp;
 5265         struct vnode *dvp, *tvp;
 5266         int error;
 5267 
 5268         ndp = fpl->ndp;
 5269         cnp = fpl->cnp;
 5270         dvp = fpl->dvp;
 5271         tvp = fpl->tvp;
 5272 
 5273         if (cache_fpl_islastcn(ndp)) {
 5274                 if ((cnp->cn_flags & FOLLOW) == 0) {
 5275                         return (cache_fplookup_final(fpl));
 5276                 }
 5277         }
 5278 
 5279         mp = atomic_load_ptr(&dvp->v_mount);
 5280         if (__predict_false(mp == NULL)) {
 5281                 return (cache_fpl_aborted(fpl));
 5282         }
 5283 
 5284         /*
 5285          * Note this check races against setting the flag, just like
 5286          * regular lookup does.
 5287          */
 5288         if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
 5289                 cache_fpl_smr_exit(fpl);
 5290                 return (cache_fpl_handled_error(fpl, EACCES));
 5291         }
 5292 
 5293         error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
 5294         if (__predict_false(error != 0)) {
 5295                 switch (error) {
 5296                 case EAGAIN:
 5297                         return (cache_fpl_partial(fpl));
 5298                 case ENOENT:
 5299                 case ENAMETOOLONG:
 5300                 case ELOOP:
 5301                         cache_fpl_smr_exit(fpl);
 5302                         return (cache_fpl_handled_error(fpl, error));
 5303                 default:
 5304                         return (cache_fpl_aborted(fpl));
 5305                 }
 5306         }
 5307 
 5308         if (*(cnp->cn_nameptr) == '/') {
 5309                 fpl->dvp = cache_fpl_handle_root(fpl);
 5310                 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
 5311                 if (seqc_in_modify(fpl->dvp_seqc)) {
 5312                         return (cache_fpl_aborted(fpl));
 5313                 }
 5314                 /*
 5315                  * The main loop assumes that ->dvp points to a vnode belonging
 5316                  * to a filesystem which can do lockless lookup, but the absolute
 5317          * symlink may wander off to one which does not.
 5318                  */
 5319                 mp = atomic_load_ptr(&fpl->dvp->v_mount);
 5320                 if (__predict_false(mp == NULL)) {
 5321                         return (cache_fpl_aborted(fpl));
 5322                 }
 5323                 if (!cache_fplookup_mp_supported(mp)) {
 5324                         cache_fpl_checkpoint(fpl);
 5325                         return (cache_fpl_partial(fpl));
 5326                 }
 5327         }
 5328         return (0);
 5329 }
 5330 
 5331 static int
 5332 cache_fplookup_next(struct cache_fpl *fpl)
 5333 {
 5334         struct componentname *cnp;
 5335         struct namecache *ncp;
 5336         struct vnode *dvp, *tvp;
 5337         u_char nc_flag;
 5338         uint32_t hash;
 5339         int error;
 5340 
 5341         cnp = fpl->cnp;
 5342         dvp = fpl->dvp;
 5343         hash = fpl->hash;
 5344 
 5345         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
 5346                 if (cnp->cn_namelen == 1) {
 5347                         return (cache_fplookup_dot(fpl));
 5348                 }
 5349                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 5350                         return (cache_fplookup_dotdot(fpl));
 5351                 }
 5352         }
 5353 
 5354         MPASS(!cache_fpl_isdotdot(cnp));
 5355 
 5356         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 5357                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 5358                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 5359                         break;
 5360         }
 5361 
 5362         if (__predict_false(ncp == NULL)) {
 5363                 return (cache_fplookup_noentry(fpl));
 5364         }
 5365 
 5366         tvp = atomic_load_ptr(&ncp->nc_vp);
 5367         nc_flag = atomic_load_char(&ncp->nc_flag);
 5368         if ((nc_flag & NCF_NEGATIVE) != 0) {
 5369                 return (cache_fplookup_neg(fpl, ncp, hash));
 5370         }
 5371 
 5372         if (!cache_ncp_canuse(ncp)) {
 5373                 return (cache_fpl_partial(fpl));
 5374         }
 5375 
 5376         fpl->tvp = tvp;
 5377         fpl->tvp_seqc = vn_seqc_read_any(tvp);
 5378         if (seqc_in_modify(fpl->tvp_seqc)) {
 5379                 return (cache_fpl_partial(fpl));
 5380         }
 5381 
 5382         counter_u64_add(numposhits, 1);
 5383         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
 5384 
 5385         error = 0;
 5386         if (cache_fplookup_is_mp(fpl)) {
 5387                 error = cache_fplookup_cross_mount(fpl);
 5388         }
 5389         return (error);
 5390 }
 5391 
 5392 static bool
 5393 cache_fplookup_mp_supported(struct mount *mp)
 5394 {
 5395 
 5396         MPASS(mp != NULL);
 5397         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
 5398                 return (false);
 5399         return (true);
 5400 }
 5401 
 5402 /*
 5403  * Walk up the mount stack (if any).
 5404  *
 5405  * Correctness is provided in the following ways:
 5406  * - all vnodes are protected from freeing with SMR
 5407  * - struct mount objects are type-stable, making them always safe to access
 5408  * - stability of the particular mount is provided by busying it
 5409  * - relationship between the vnode which is mounted on and the mount is
 5410  *   verified with the vnode sequence counter after busying
 5411  * - association between root vnode of the mount and the mount is protected
 5412  *   by busy
 5413  *
 5414  * From that point on we can read the sequence counter of the root vnode
 5415  * and get the next mount on the stack (if any) using the same protection.
 5416  *
 5417  * By the end of a successful walk we are guaranteed the reached state was
 5418  * indeed present at least at some point, which matches the regular lookup.
 5419  */
 5420 static int __noinline
 5421 cache_fplookup_climb_mount(struct cache_fpl *fpl)
 5422 {
 5423         struct mount *mp, *prev_mp;
 5424         struct mount_pcpu *mpcpu, *prev_mpcpu;
 5425         struct vnode *vp;
 5426         seqc_t vp_seqc;
 5427 
 5428         vp = fpl->tvp;
 5429         vp_seqc = fpl->tvp_seqc;
 5430 
 5431         VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
 5432         mp = atomic_load_ptr(&vp->v_mountedhere);
 5433         if (__predict_false(mp == NULL)) {
 5434                 return (0);
 5435         }
 5436 
 5437         prev_mp = NULL;
 5438         for (;;) {
 5439                 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
 5440                         if (prev_mp != NULL)
 5441                                 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
 5442                         return (cache_fpl_partial(fpl));
 5443                 }
 5444                 if (prev_mp != NULL)
 5445                         vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
 5446                 if (!vn_seqc_consistent(vp, vp_seqc)) {
 5447                         vfs_op_thread_exit_crit(mp, mpcpu);
 5448                         return (cache_fpl_partial(fpl));
 5449                 }
 5450                 if (!cache_fplookup_mp_supported(mp)) {
 5451                         vfs_op_thread_exit_crit(mp, mpcpu);
 5452                         return (cache_fpl_partial(fpl));
 5453                 }
 5454                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
 5455                 if (vp == NULL) {
 5456                         vfs_op_thread_exit_crit(mp, mpcpu);
 5457                         return (cache_fpl_partial(fpl));
 5458                 }
 5459                 vp_seqc = vn_seqc_read_any(vp);
 5460                 if (seqc_in_modify(vp_seqc)) {
 5461                         vfs_op_thread_exit_crit(mp, mpcpu);
 5462                         return (cache_fpl_partial(fpl));
 5463                 }
 5464                 prev_mp = mp;
 5465                 prev_mpcpu = mpcpu;
 5466                 mp = atomic_load_ptr(&vp->v_mountedhere);
 5467                 if (mp == NULL)
 5468                         break;
 5469         }
 5470 
 5471         vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
 5472         fpl->tvp = vp;
 5473         fpl->tvp_seqc = vp_seqc;
 5474         return (0);
 5475 }
 5476 
 5477 static int __noinline
 5478 cache_fplookup_cross_mount(struct cache_fpl *fpl)
 5479 {
 5480         struct mount *mp;
 5481         struct mount_pcpu *mpcpu;
 5482         struct vnode *vp;
 5483         seqc_t vp_seqc;
 5484 
 5485         vp = fpl->tvp;
 5486         vp_seqc = fpl->tvp_seqc;
 5487 
 5488         VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
 5489         mp = atomic_load_ptr(&vp->v_mountedhere);
 5490         if (__predict_false(mp == NULL)) {
 5491                 return (0);
 5492         }
 5493 
 5494         if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
 5495                 return (cache_fpl_partial(fpl));
 5496         }
 5497         if (!vn_seqc_consistent(vp, vp_seqc)) {
 5498                 vfs_op_thread_exit_crit(mp, mpcpu);
 5499                 return (cache_fpl_partial(fpl));
 5500         }
 5501         if (!cache_fplookup_mp_supported(mp)) {
 5502                 vfs_op_thread_exit_crit(mp, mpcpu);
 5503                 return (cache_fpl_partial(fpl));
 5504         }
 5505         vp = atomic_load_ptr(&mp->mnt_rootvnode);
 5506         if (__predict_false(vp == NULL)) {
 5507                 vfs_op_thread_exit_crit(mp, mpcpu);
 5508                 return (cache_fpl_partial(fpl));
 5509         }
 5510         vp_seqc = vn_seqc_read_any(vp);
 5511         vfs_op_thread_exit_crit(mp, mpcpu);
 5512         if (seqc_in_modify(vp_seqc)) {
 5513                 return (cache_fpl_partial(fpl));
 5514         }
 5515         mp = atomic_load_ptr(&vp->v_mountedhere);
 5516         if (__predict_false(mp != NULL)) {
 5517                 /*
 5518                  * There are possibly more mount points on top.
 5519                  * Normally this does not happen, so for simplicity just start
 5520                  * over.
 5521                  */
 5522                 return (cache_fplookup_climb_mount(fpl));
 5523         }
 5524 
 5525         fpl->tvp = vp;
 5526         fpl->tvp_seqc = vp_seqc;
 5527         return (0);
 5528 }
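
/*
 * The validation sequence shared by the two routines above, shown in
 * isolation (a sketch; the helper is hypothetical): pin the mount with a
 * per-CPU reference, re-check that the covered vnode still has the
 * observed identity, then read the root vnode.  The returned vnode remains
 * safe to inspect under SMR; any failure simply punts to the locked path.
 */
static int __unused
mount_root_peek_sketch(struct vnode *vp, seqc_t vp_seqc, struct vnode **rvpp)
{
        struct mount *mp;
        struct mount_pcpu *mpcpu;

        mp = atomic_load_ptr(&vp->v_mountedhere);
        if (mp == NULL)
                return (ENOENT);        /* nothing is mounted here */
        if (!vfs_op_thread_enter_crit(mp, mpcpu))
                return (EAGAIN);        /* unmount in progress */
        if (!vn_seqc_consistent(vp, vp_seqc)) {
                vfs_op_thread_exit_crit(mp, mpcpu);
                return (EAGAIN);        /* the covered vnode changed */
        }
        *rvpp = atomic_load_ptr(&mp->mnt_rootvnode);
        vfs_op_thread_exit_crit(mp, mpcpu);
        return (*rvpp == NULL ? EAGAIN : 0);
}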
 5529 
 5530 /*
 5531  * Check if a vnode is mounted on.
 5532  */
 5533 static bool
 5534 cache_fplookup_is_mp(struct cache_fpl *fpl)
 5535 {
 5536         struct vnode *vp;
 5537 
 5538         vp = fpl->tvp;
 5539         return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0);
 5540 }
 5541 
 5542 /*
 5543  * Parse the path.
 5544  *
 5545  * The code was originally copy-pasted from regular lookup and, despite
 5546  * clean-ups, still leaves performance on the table. Any modifications here
 5547  * must take into account that in case of fallback the resulting
 5548  * nameidata state has to be compatible with the original.
 5549  */
 5550 
 5551 /*
 5552  * Debug ni_pathlen tracking.
 5553  */
 5554 #ifdef INVARIANTS
 5555 static void
 5556 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
 5557 {
 5558 
 5559         fpl->debug.ni_pathlen += n;
 5560         KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
 5561             ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
 5562 }
 5563 
 5564 static void
 5565 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
 5566 {
 5567 
 5568         fpl->debug.ni_pathlen -= n;
 5569         KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
 5570             ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
 5571 }
 5572 
 5573 static void
 5574 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
 5575 {
 5576 
 5577         cache_fpl_pathlen_add(fpl, 1);
 5578 }
 5579 
 5580 static void
 5581 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
 5582 {
 5583 
 5584         cache_fpl_pathlen_sub(fpl, 1);
 5585 }
 5586 #else
 5587 static void
 5588 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
 5589 {
 5590 }
 5591 
 5592 static void
 5593 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
 5594 {
 5595 }
 5596 
 5597 static void
 5598 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
 5599 {
 5600 }
 5601 
 5602 static void
 5603 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
 5604 {
 5605 }
 5606 #endif
 5607 
 5608 static void
 5609 cache_fplookup_parse(struct cache_fpl *fpl)
 5610 {
 5611         struct nameidata *ndp;
 5612         struct componentname *cnp;
 5613         struct vnode *dvp;
 5614         char *cp;
 5615         uint32_t hash;
 5616 
 5617         ndp = fpl->ndp;
 5618         cnp = fpl->cnp;
 5619         dvp = fpl->dvp;
 5620 
 5621         /*
 5622          * Find the end of this path component; it is either / or nul.
 5623          *
 5624          * Store / as a temporary sentinel so that we only have one character
 5625          * to test for. Pathnames tend to be short, so this should not
 5626          * result in cache misses.
 5627          *
 5628          * TODO: fix this to be word-sized.
 5629          */
 5630         MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf);
 5631         KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
 5632             ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
 5633             __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
 5634             fpl->nulchar, cnp->cn_pnbuf));
 5635         KASSERT(*fpl->nulchar == '\0',
 5636             ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
 5637             cnp->cn_pnbuf));
 5638         hash = cache_get_hash_iter_start(dvp);
 5639         *fpl->nulchar = '/';
 5640         for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
 5641                 KASSERT(*cp != '\0',
 5642                     ("%s: encountered unexpected nul; string [%s]\n", __func__,
 5643                     cnp->cn_nameptr));
 5644                 hash = cache_get_hash_iter(*cp, hash);
 5645                 continue;
 5646         }
 5647         *fpl->nulchar = '\0';
 5648         fpl->hash = cache_get_hash_iter_finish(hash);
 5649 
 5650         cnp->cn_namelen = cp - cnp->cn_nameptr;
 5651         cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
 5652 
 5653 #ifdef INVARIANTS
 5654         /*
 5655          * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
 5656          * we are going to fail this lookup with ENAMETOOLONG (see below).
 5657          */
 5658         if (cnp->cn_namelen <= NAME_MAX) {
 5659                 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
 5660                         panic("%s: mismatched hash for [%s] len %ld", __func__,
 5661                             cnp->cn_nameptr, cnp->cn_namelen);
 5662                 }
 5663         }
 5664 #endif
 5665 
 5666         /*
 5667          * Hack: we have to check if the found path component's length exceeds
 5668          * NAME_MAX. However, the condition is very rarely true and the check can
 5669          * be elided in the common case -- if an entry was found in the cache,
 5670          * then it could not have been too long to begin with.
 5671          */
 5672         ndp->ni_next = cp;
 5673 }
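
/*
 * The sentinel trick used above, in isolation (a sketch; the helper is
 * hypothetical): temporarily overwrite the terminating nul with '/' so
 * that the scan loop only tests for a single character, then restore it.
 */
static size_t __unused
component_len_sketch(char *name, char *nulchar)
{
        char *cp;

        *nulchar = '/';                 /* every component now ends with '/' */
        for (cp = name; *cp != '/'; cp++)
                continue;
        *nulchar = '\0';                /* restore the terminator */
        return ((size_t)(cp - name));
}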
 5674 
 5675 static void
 5676 cache_fplookup_parse_advance(struct cache_fpl *fpl)
 5677 {
 5678         struct nameidata *ndp;
 5679         struct componentname *cnp;
 5680 
 5681         ndp = fpl->ndp;
 5682         cnp = fpl->cnp;
 5683 
 5684         cnp->cn_nameptr = ndp->ni_next;
 5685         KASSERT(*(cnp->cn_nameptr) == '/',
 5686             ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
 5687             cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
 5688         cnp->cn_nameptr++;
 5689         cache_fpl_pathlen_dec(fpl);
 5690 }
 5691 
 5692 /*
 5693  * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
 5694  *
 5695  * Lockless lookup tries to elide checking for spurious slashes; should they
 5696  * be present, it is guaranteed to fail to find an entry. In this case the
 5697  * caller must check if the name starts with a slash and call this routine,
 5698  * which fast-forwards across the spurious slashes and sets the state up for
 5699  * a retry.
 5700  */
 5701 static int __noinline
 5702 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
 5703 {
 5704         struct nameidata *ndp;
 5705         struct componentname *cnp;
 5706 
 5707         ndp = fpl->ndp;
 5708         cnp = fpl->cnp;
 5709 
 5710         MPASS(*(cnp->cn_nameptr) == '/');
 5711         do {
 5712                 cnp->cn_nameptr++;
 5713                 cache_fpl_pathlen_dec(fpl);
 5714         } while (*(cnp->cn_nameptr) == '/');
 5715 
 5716         /*
 5717          * Go back to one slash so that cache_fplookup_parse_advance has
 5718          * something to skip.
 5719          */
 5720         cnp->cn_nameptr--;
 5721         cache_fpl_pathlen_inc(fpl);
 5722 
 5723         /*
 5724          * cache_fplookup_parse_advance starts from ndp->ni_next
 5725          */
 5726         ndp->ni_next = cnp->cn_nameptr;
 5727 
 5728         /*
 5729          * See cache_fplookup_dot.
 5730          */
 5731         fpl->tvp = fpl->dvp;
 5732         fpl->tvp_seqc = fpl->dvp_seqc;
 5733 
 5734         return (0);
 5735 }
 5736 
 5737 /*
 5738  * Handle trailing slashes (e.g., "foo/").
 5739  *
 5740  * If a trailing slash is found, the terminal vnode must be a directory.
 5741  * Regular lookup shortens the path by nullifying the first trailing slash and
 5742  * sets the TRAILINGSLASH flag to denote this took place. Several checks on
 5743  * it are performed later.
 5744  *
 5745  * Similarly to spurious slashes, lockless lookup handles this in a speculative
 5746  * manner, relying on the invariant that a non-directory vnode will get a miss.
 5747  * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
 5748  *
 5749  * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
 5750  * and denotes this is the last path component, which avoids looping back.
 5751  *
 5752  * Only plain lookups are supported for now, to limit the corner cases to handle.
 5753  */
 5754 static int __noinline
 5755 cache_fplookup_trailingslash(struct cache_fpl *fpl)
 5756 {
 5757 #ifdef INVARIANTS
 5758         size_t ni_pathlen;
 5759 #endif
 5760         struct nameidata *ndp;
 5761         struct componentname *cnp;
 5762         struct namecache *ncp;
 5763         struct vnode *tvp;
 5764         char *cn_nameptr_orig, *cn_nameptr_slash;
 5765         seqc_t tvp_seqc;
 5766         u_char nc_flag;
 5767 
 5768         ndp = fpl->ndp;
 5769         cnp = fpl->cnp;
 5770         tvp = fpl->tvp;
 5771         tvp_seqc = fpl->tvp_seqc;
 5772 
 5773         MPASS(fpl->dvp == fpl->tvp);
 5774         KASSERT(cache_fpl_istrailingslash(fpl),
 5775             ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
 5776             cnp->cn_pnbuf));
 5777         KASSERT(cnp->cn_nameptr[0] == '\0',
 5778             ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
 5779             cnp->cn_pnbuf));
 5780         KASSERT(cnp->cn_namelen == 0,
 5781             ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen,
 5782             cnp->cn_pnbuf));
 5783         MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);
 5784 
 5785         if (cnp->cn_nameiop != LOOKUP) {
 5786                 return (cache_fpl_aborted(fpl));
 5787         }
 5788 
 5789         if (__predict_false(tvp->v_type != VDIR)) {
 5790                 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
 5791                         return (cache_fpl_aborted(fpl));
 5792                 }
 5793                 cache_fpl_smr_exit(fpl);
 5794                 return (cache_fpl_handled_error(fpl, ENOTDIR));
 5795         }
 5796 
 5797         /*
 5798          * Denote the last component.
 5799          */
 5800         ndp->ni_next = &cnp->cn_nameptr[0];
 5801         MPASS(cache_fpl_islastcn(ndp));
 5802 
 5803         /*
 5804          * Unwind trailing slashes.
 5805          */
 5806         cn_nameptr_orig = cnp->cn_nameptr;
 5807         while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
 5808                 cnp->cn_nameptr--;
 5809                 if (cnp->cn_nameptr[0] != '/') {
 5810                         break;
 5811                 }
 5812         }
 5813 
 5814         /*
 5815          * Unwind to the beginning of the path component.
 5816          *
 5817          * Note the path may or may not have started with a slash.
 5818          */
 5819         cn_nameptr_slash = cnp->cn_nameptr;
 5820         while (cnp->cn_nameptr > cnp->cn_pnbuf) {
 5821                 cnp->cn_nameptr--;
 5822                 if (cnp->cn_nameptr[0] == '/') {
 5823                         break;
 5824                 }
 5825         }
 5826         if (cnp->cn_nameptr[0] == '/') {
 5827                 cnp->cn_nameptr++;
 5828         }
 5829 
 5830         cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
 5831         cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
 5832         cache_fpl_checkpoint(fpl);
 5833 
 5834 #ifdef INVARIANTS
 5835         ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
 5836         if (ni_pathlen != fpl->debug.ni_pathlen) {
 5837                 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
 5838                     __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
 5839                     cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
 5840         }
 5841 #endif
 5842 
 5843         /*
 5844          * If this was a "./" lookup the parent directory is already correct.
 5845          */
 5846         if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
 5847                 return (0);
 5848         }
 5849 
 5850         /*
 5851          * Otherwise we need to look it up.
 5852          */
 5853         tvp = fpl->tvp;
 5854         ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
 5855         if (__predict_false(ncp == NULL)) {
 5856                 return (cache_fpl_aborted(fpl));
 5857         }
 5858         nc_flag = atomic_load_char(&ncp->nc_flag);
 5859         if ((nc_flag & NCF_ISDOTDOT) != 0) {
 5860                 return (cache_fpl_aborted(fpl));
 5861         }
 5862         fpl->dvp = ncp->nc_dvp;
 5863         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
 5864         if (seqc_in_modify(fpl->dvp_seqc)) {
 5865                 return (cache_fpl_aborted(fpl));
 5866         }
 5867         return (0);
 5868 }
 5869 
 5870 /*
 5871  * See the API contract for VOP_FPLOOKUP_VEXEC.
 5872  */
 5873 static int __noinline
 5874 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
 5875 {
 5876         struct componentname *cnp;
 5877         struct vnode *dvp;
 5878         seqc_t dvp_seqc;
 5879 
 5880         cnp = fpl->cnp;
 5881         dvp = fpl->dvp;
 5882         dvp_seqc = fpl->dvp_seqc;
 5883 
 5884         /*
 5885          * Hack: delayed empty path checking.
 5886          */
 5887         if (cnp->cn_pnbuf[0] == '\0') {
 5888                 return (cache_fplookup_emptypath(fpl));
 5889         }
 5890 
 5891         /*
 5892          * TODO: Due to ignoring trailing slashes, lookup will perform a
 5893          * permission check on the last directory when it should not. It
 5894          * may fail, but said failure should be ignored. It is possible to fix
 5895          * this fully without resorting to regular lookup, but for now just
 5896          * abort.
 5897          */
 5898         if (cache_fpl_istrailingslash(fpl)) {
 5899                 return (cache_fpl_aborted(fpl));
 5900         }
 5901 
 5902         /*
 5903          * Hack: delayed degenerate path checking.
 5904          */
 5905         if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
 5906                 return (cache_fplookup_degenerate(fpl));
 5907         }
 5908 
 5909         /*
 5910          * Hack: delayed name len checking.
 5911          */
 5912         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
 5913                 cache_fpl_smr_exit(fpl);
 5914                 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
 5915         }
 5916 
 5917         /*
 5918          * Hack: they may be looking up foo/bar, where foo is not a directory.
 5919          * In such a case we need to return ENOTDIR, but we may happen to get
 5920          * here with a different error.
 5921          */
 5922         if (dvp->v_type != VDIR) {
 5923                 error = ENOTDIR;
 5924         }
 5925 
 5926         /*
 5927          * Hack: handle O_SEARCH.
 5928          *
 5929          * Open Group Base Specifications Issue 7, 2018 edition states:
 5930          * <quote>
 5931          * If the access mode of the open file description associated with the
 5932          * file descriptor is not O_SEARCH, the function shall check whether
 5933          * directory searches are permitted using the current permissions of
 5934          * the directory underlying the file descriptor. If the access mode is
 5935          * O_SEARCH, the function shall not perform the check.
 5936          * </quote>
 5937          *
 5938          * Regular lookup tests for the NOEXECCHECK flag for every path
 5939          * component to decide whether to do the permission check. However,
 5940          * since most lookups never have the flag (and when they do it is only
 5941          * present for the first path component), lockless lookup only acts on
 5942          * it if there is a permission problem. Here the flag is represented
 5943          * with a boolean so that we don't have to clear it on the way out.
 5944          *
 5945          * For simplicity this always aborts.
 5946          * TODO: check if this is the first lookup and ignore the permission
 5947          * problem. Note the flag has to survive fallback (if it happens to be
 5948          * performed).
 5949          */
 5950         if (fpl->fsearch) {
 5951                 return (cache_fpl_aborted(fpl));
 5952         }
 5953 
 5954         switch (error) {
 5955         case EAGAIN:
 5956                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 5957                         error = cache_fpl_aborted(fpl);
 5958                 } else {
 5959                         cache_fpl_partial(fpl);
 5960                 }
 5961                 break;
 5962         default:
 5963                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 5964                         error = cache_fpl_aborted(fpl);
 5965                 } else {
 5966                         cache_fpl_smr_exit(fpl);
 5967                         cache_fpl_handled_error(fpl, error);
 5968                 }
 5969                 break;
 5970         }
 5971         return (error);
 5972 }
 5973 
 5974 static int
 5975 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
 5976 {
 5977         struct nameidata *ndp;
 5978         struct componentname *cnp;
 5979         struct mount *mp;
 5980         int error;
 5981 
 5982         ndp = fpl->ndp;
 5983         cnp = fpl->cnp;
 5984 
 5985         cache_fpl_checkpoint(fpl);
 5986 
 5987         /*
 5988          * The vnode at hand is almost always stable, so skip checking it here.
 5989          * At worst this merely postpones the check until the end of the main
 5990          * loop iteration.
 5991          */
 5992         fpl->dvp = dvp;
 5993         fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);
 5994 
 5995         mp = atomic_load_ptr(&dvp->v_mount);
 5996         if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
 5997                 return (cache_fpl_aborted(fpl));
 5998         }
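              /*
               * As a hedged sketch, opting in amounts to setting MNTK_FPLOOKUP on
               * the mount; in-tree filesystems do roughly the following from their
               * mount routines (exact placement varies per filesystem):
               *
               *      MNT_ILOCK(mp);
               *      mp->mnt_kern_flag |= MNTK_FPLOOKUP;
               *      MNT_IUNLOCK(mp);
               */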
 5999 
 6000         MPASS(fpl->tvp == NULL);
 6001 
 6002         for (;;) {
 6003                 cache_fplookup_parse(fpl);
 6004 
 6005                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
 6006                 if (__predict_false(error != 0)) {
 6007                         error = cache_fplookup_failed_vexec(fpl, error);
 6008                         break;
 6009                 }
 6010 
 6011                 error = cache_fplookup_next(fpl);
 6012                 if (__predict_false(cache_fpl_terminated(fpl))) {
 6013                         break;
 6014                 }
 6015 
 6016                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
 6017 
 6018                 if (fpl->tvp->v_type == VLNK) {
 6019                         error = cache_fplookup_symlink(fpl);
 6020                         if (cache_fpl_terminated(fpl)) {
 6021                                 break;
 6022                         }
 6023                 } else {
 6024                         if (cache_fpl_islastcn(ndp)) {
 6025                                 error = cache_fplookup_final(fpl);
 6026                                 break;
 6027                         }
 6028 
 6029                         if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
 6030                                 error = cache_fpl_aborted(fpl);
 6031                                 break;
 6032                         }
 6033 
 6034                         fpl->dvp = fpl->tvp;
 6035                         fpl->dvp_seqc = fpl->tvp_seqc;
 6036                         cache_fplookup_parse_advance(fpl);
 6037                 }
 6038 
 6039                 cache_fpl_checkpoint(fpl);
 6040         }
 6041 
 6042         return (error);
 6043 }
 6044 
 6045 /*
 6046  * Fast path lookup protected with SMR and sequence counters.
 6047  *
 6048  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 6049  *
 6050  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
 6051  * criteria outlined below.
 6052  *
 6053  * Traditional vnode lookup conceptually looks like this:
 6054  *
 6055  * vn_lock(current);
 6056  * for (;;) {
 6057  *      next = find();
 6058  *      vn_lock(next);
 6059  *      vn_unlock(current);
 6060  *      current = next;
 6061  *      if (last)
 6062  *          break;
 6063  * }
 6064  * return (current);
 6065  *
 6066  * Each jump to the next vnode is safe memory-wise and atomic with respect to
 6067  * any modifications because the respective locks are held.
 6068  *
 6069  * The same guarantee can be provided with a combination of safe memory
 6070  * reclamation and sequence counters instead.  If all operations that affect
 6071  * the relationship between the current vnode and the one we are looking for
 6072  * also modify the counter, we can verify whether all the conditions held as
 6073  * we made the jump.  This includes things like permissions, mount points, etc.
 6074  * Counter modification is provided by enclosing the relevant places in
 6075  * vn_seqc_write_begin()/end() calls.
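       *
       * As a hedged sketch of the writer side, code changing anything the lookup
       * depends on (say, permissions) brackets the modification, which moves the
       * counter through an "in modify" state that readers can detect:
       *
       *      vn_seqc_write_begin(vp);
       *      // change permissions, link state, etc.
       *      vn_seqc_write_end(vp);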
 6076  *
 6077  * Thus this translates to:
 6078  *
 6079  * vfs_smr_enter();
 6080  * dvp_seqc = seqc_read_any(dvp);
 6081  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 6082  *     abort();
 6083  * for (;;) {
 6084  *      tvp = find();
 6085  *      tvp_seqc = seqc_read_any(tvp);
 6086  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 6087  *          abort();
 6088  *      if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 6089  *          abort();
 6090  *      dvp = tvp; // we know nothing of importance has changed
 6091  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 6092  *      if (last)
 6093  *          break;
 6094  * }
 6095  * vget(); // secure the vnode
 6096  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
 6097  *          abort();
 6098  * // at this point we know nothing has changed for any parent<->child pair
 6099  * // crossed during the lookup, meaning we matched the guarantee of the
 6100  * // locked variant
 6101  * return (tvp);
 6102  *
 6103  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 6104  * - they are called while within vfs_smr protection, which they must never exit
 6105  * - EAGAIN can be returned to denote that checking could not be performed;
 6106  *   it is always valid to return it
 6107  * - if the sequence counter has not changed, the result must be valid
 6108  * - if the sequence counter has changed, both false positives and false
 6109  *   negatives are permitted (since the result will be rejected later)
 6110  * - for simple cases of Unix permission checks, vaccess_vexec_smr can be used
 6111  *
 6112  * Caveats to watch out for:
 6113  * - vnodes are passed unlocked and unreferenced with nothing stopping
 6114  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 6115  *   to use atomic_load_ptr to fetch it.
 6116  * - the aforementioned object can also get freed, meaning that, absent other
 6117  *   means, it should be protected with vfs_smr
 6118  * - either safely checking permissions as they are modified or guaranteeing
 6119  *   their stability is left to the routine
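       *
       * Putting the contract together, a minimal sketch of what a filesystem's
       * VOP_FPLOOKUP_VEXEC routine can look like (the "xxx" names are
       * illustrative; in-tree implementations follow this shape):
       *
       *      static int
       *      xxx_fplookup_vexec(struct vop_fplookup_vexec_args *v)
       *      {
       *              struct xxx_node *np;
       *
       *              np = atomic_load_ptr(&v->a_vp->v_data);
       *              if (__predict_false(np == NULL))
       *                      return (EAGAIN); // racing VOP_RECLAIM, see above
       *              return (vaccess_vexec_smr(np->xn_mode, np->xn_uid,
       *                  np->xn_gid, v->a_cred));
       *      }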
 6120  */
 6121 int
 6122 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
 6123     struct pwd **pwdp)
 6124 {
 6125         struct cache_fpl fpl;
 6126         struct pwd *pwd;
 6127         struct vnode *dvp;
 6128         struct componentname *cnp;
 6129         int error;
 6130 
 6131         fpl.status = CACHE_FPL_STATUS_UNSET;
 6132         fpl.in_smr = false;
 6133         fpl.ndp = ndp;
 6134         fpl.cnp = cnp = &ndp->ni_cnd;
 6135         MPASS(ndp->ni_lcf == 0);
 6136         KASSERT((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
 6137             ("%s: internal flags found in cn_flags %" PRIx64, __func__,
 6138             cnp->cn_flags));
 6139         MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
 6140 
 6141         if (__predict_false(!cache_can_fplookup(&fpl))) {
 6142                 *status = fpl.status;
 6143                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
 6144                 return (EOPNOTSUPP);
 6145         }
 6146 
 6147         cache_fpl_checkpoint_outer(&fpl);
 6148 
 6149         cache_fpl_smr_enter_initial(&fpl);
 6150 #ifdef INVARIANTS
 6151         fpl.debug.ni_pathlen = ndp->ni_pathlen;
 6152 #endif
 6153         fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
 6154         fpl.fsearch = false;
 6155         fpl.tvp = NULL; /* for degenerate path handling */
 6156         fpl.pwd = pwdp;
 6157         pwd = pwd_get_smr();
 6158         *(fpl.pwd) = pwd;
 6159         ndp->ni_rootdir = pwd->pwd_rdir;
 6160         ndp->ni_topdir = pwd->pwd_jdir;
 6161 
 6162         if (cnp->cn_pnbuf[0] == '/') {
 6163                 dvp = cache_fpl_handle_root(&fpl);
 6164                 MPASS(ndp->ni_resflags == 0);
 6165                 ndp->ni_resflags = NIRES_ABS;
 6166         } else {
 6167                 if (ndp->ni_dirfd == AT_FDCWD) {
 6168                         dvp = pwd->pwd_cdir;
 6169                 } else {
 6170                         error = cache_fplookup_dirfd(&fpl, &dvp);
 6171                         if (__predict_false(error != 0)) {
 6172                                 goto out;
 6173                         }
 6174                 }
 6175         }
 6176 
 6177         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
 6178         error = cache_fplookup_impl(dvp, &fpl);
 6179 out:
 6180         cache_fpl_smr_assert_not_entered(&fpl);
 6181         cache_fpl_assert_status(&fpl);
 6182         *status = fpl.status;
 6183         if (SDT_PROBES_ENABLED()) {
 6184                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
 6185                 if (fpl.status == CACHE_FPL_STATUS_HANDLED)
 6186                         SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
 6187                             ndp);
 6188         }
 6189 
 6190         if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
 6191                 MPASS(error != CACHE_FPL_FAILED);
 6192                 if (error != 0) {
 6193                         cache_fpl_cleanup_cnp(fpl.cnp);
 6194                         MPASS(fpl.dvp == NULL);
 6195                         MPASS(fpl.tvp == NULL);
 6196                 }
 6197                 ndp->ni_dvp = fpl.dvp;
 6198                 ndp->ni_vp = fpl.tvp;
 6199         }
 6200         return (error);
 6201 }
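
      /*
       * A hedged sketch of the caller side (abbreviated from the logic in
       * vfs_lookup.c): namei() attempts the lockless pass first and retries
       * with the regular locked lookup unless the attempt was definitively
       * handled.
       *
       *      error = cache_fplookup(ndp, &status, &pwd);
       *      switch (status) {
       *      case CACHE_FPL_STATUS_HANDLED:
       *              return (error); // resolved, or failed for a real reason
       *      case CACHE_FPL_STATUS_PARTIAL:
       *      case CACHE_FPL_STATUS_ABORTED:
       *              // fall back to the regular locked lookup
       *              break;
       *      }
       */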
