sys/kern/vfs_cache.c
1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Poul-Henning Kamp of the FreeBSD Project.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
35 */
36
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39
40 #include "opt_ddb.h"
41 #include "opt_ktrace.h"
42
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/capsicum.h>
46 #include <sys/counter.h>
47 #include <sys/filedesc.h>
48 #include <sys/fnv_hash.h>
49 #include <sys/kernel.h>
50 #include <sys/ktr.h>
51 #include <sys/lock.h>
52 #include <sys/malloc.h>
53 #include <sys/fcntl.h>
54 #include <sys/jail.h>
55 #include <sys/mount.h>
56 #include <sys/namei.h>
57 #include <sys/proc.h>
58 #include <sys/seqc.h>
59 #include <sys/sdt.h>
60 #include <sys/smr.h>
61 #include <sys/smp.h>
62 #include <sys/syscallsubr.h>
63 #include <sys/sysctl.h>
64 #include <sys/sysproto.h>
65 #include <sys/vnode.h>
66 #include <ck_queue.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 #ifdef INVARIANTS
71 #include <machine/_inttypes.h>
72 #endif
73
76 #include <security/audit/audit.h>
77 #include <security/mac/mac_framework.h>
78
79 #ifdef DDB
80 #include <ddb/ddb.h>
81 #endif
82
83 #include <vm/uma.h>
84
85 /*
86 * High level overview of name caching in the VFS layer.
87 *
88 * Originally caching was implemented as part of UFS, later extracted to allow
89 * use by other filesystems. A decision was made to make it optional and
90 * completely detached from the rest of the kernel, which comes with limitations
91 * outlined near the end of this comment block.
92 *
93 * This fundamental choice needs to be revisited. In the meantime, the current
94 * state is described below. Significance of all notable routines is explained
95 * in comments placed above their implementation. Scattered throughout the
96 * file are TODO comments indicating shortcomings which can be fixed without
97 * reworking everything (most of the fixes will likely be reusable). Various
98 * details are omitted from this explanation to not clutter the overview; they
99 * have to be checked by reading the code and associated commentary.
100 *
101 * Keep in mind that it's individual path components which are cached, not full
102 * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
103 * one for each name.
104 *
105 * I. Data organization
106 *
107 * Entries are described by "struct namecache" objects and stored in a hash
108 * table. See cache_get_hash for more information.
109 *
110 * "struct vnode" contains pointers to source entries (names which can be found
111 * when traversing through said vnode), destination entries (names of that
112 * vnode (see "Limitations" for a breakdown on the subject)) and a pointer to
113 * the parent vnode.
114 *
115 * The (directory vnode; name) tuple reliably determines the target entry if
116 * it exists.
117 *
118 * Since there are no small locks at this time (all are 32 bytes in size on
119 * LP64), the code works around the problem by introducing lock arrays to
120 * protect hash buckets and vnode lists.
121 *
122 * II. Filesystem integration
123 *
124 * Filesystems participating in name caching do the following:
125 * - set vop_lookup routine to vfs_cache_lookup
126 * - set vop_cachedlookup to whatever can perform the lookup if the above fails
127 * - if they support lockless lookup (see below), vop_fplookup_vexec and
128 * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
129 * mount point
130 * - call cache_purge or cache_vop_* routines to eliminate stale entries as
131 * applicable
132 * - call cache_enter to add entries depending on the MAKEENTRY flag
133 *
134 * With the above in mind, there are 2 entry points when doing lookups:
135 * - ... -> namei -> cache_fplookup -- this is the default
136 * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
137 * should the above fail
138 *
139 * Example code flow how an entry is added:
140 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
141 * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
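 *
 * A hedged sketch of the filesystem side (modeled on ufs; "myfs_vnodeops" and
 * "myfs_lookup" are made-up names used purely for illustration):
 *
 *	struct vop_vector myfs_vnodeops = {
 *		.vop_default =		&default_vnodeops,
 *		.vop_lookup =		vfs_cache_lookup,
 *		.vop_cachedlookup =	myfs_lookup,
 *		(other vops omitted)
 *	};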
142 *
143 * III. Performance considerations
144 *
145 * For the lockless case, forward lookup avoids any writes to shared areas apart
146 * from the terminal path component. In other words, non-modifying lookups of
147 * different files don't suffer any scalability problems in the namecache.
148 * Looking up the same file is limited by VFS and goes beyond the scope of this
149 * file.
150 *
151 * At least on amd64 the single-threaded bottleneck for long paths is hashing
152 * (see cache_get_hash). There are cases where the code issues an acquire fence
153 * multiple times; these can be combined on architectures which suffer from it.
154 *
155 * For the locked case, each encountered vnode has to be referenced and locked
156 * in order to be handed out to the caller (normally that's namei). This
157 * introduces a significant single-threaded hit and multi-threaded serialization.
158 *
159 * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached --
160 * it avoids any writes to shared areas for any of the components.
161 *
162 * Unrelated insertions are partially serialized on updating the global entry
163 * counter and possibly serialized on colliding bucket or vnode locks.
164 *
165 * IV. Observability
166 *
167 * Note that not everything has an explicit dtrace probe, nor should it; thus
168 * some of the one-liners below depend on implementation details.
169 *
170 * Examples:
171 *
172 * # Check what lookups failed to be handled in a lockless manner. Column 1 is
173 * # line number, column 2 is status code (see cache_fpl_status)
174 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
175 *
176 * # Lengths of names added by binary name
177 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
178 *
179 * # Same as above but only those which exceed 64 characters
180 * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
181 *
182 * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
183 * # path is it
184 * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
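 *
 * # Cache misses aggregated by the program performing the lookup (a hedged
 * # example built on the lookup:miss SDT probe defined in this file)
 * dtrace -n 'vfs:namecache:lookup:miss { @[execname] = count(); }'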
185 *
186 * V. Limitations and implementation defects
187 *
188 * - since it is possible there is no entry for an open file, tools like
189 * "procstat" may fail to resolve fd -> vnode -> path to anything
190 * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
191 * shortage) in which case the above problem applies
192 * - hardlinks are not tracked, thus if a vnode is reachable in more than one
193 * way, resolving a name may return a different path than the one used to
194 * open it (even if said path is still valid)
195 * - by default entries are not added for newly created files
196 * - adding an entry may need to evict a negative entry first, which happens in 2
197 * distinct places (evicting on lookup, adding in a later VOP) making it
198 * impossible to simply reuse it
199 * - there is a simple scheme to evict negative entries as the cache is approaching
200 * its capacity, but it is very unclear if doing so is a good idea to begin with
201 * - vnodes are subject to being recycled even if the target inode is left in memory,
202 * which loses the name cache entries when it perhaps should not. In case of tmpfs
203 * names get duplicated -- kept by the filesystem itself and by the namecache separately
204 * - struct namecache has a fixed size and comes in 2 variants, often wasting space.
205 * It is now hard to replace with malloc due to the dependence on SMR.
206 * - lack of better integration with the kernel also turns nullfs into a layered
207 * filesystem instead of something which can take advantage of caching
208 */
209
210 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
211 "Name cache");
212
213 SDT_PROVIDER_DECLARE(vfs);
214 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
215 "struct vnode *");
216 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
217 "struct vnode *");
218 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
219 "char *");
220 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
221 "const char *");
222 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
223 "struct namecache *", "int", "int");
224 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
225 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
226 "char *", "struct vnode *");
227 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
228 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
229 "struct vnode *", "char *");
230 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
231 "struct vnode *");
232 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
233 "struct vnode *", "char *");
234 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
235 "char *");
236 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
237 "struct componentname *");
238 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
239 "struct componentname *");
240 SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t");
241 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
242 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
243 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
244 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
245 "struct vnode *");
246 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
247 "char *");
248 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
249 "char *");
250 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
251
252 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
253 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
254 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
255
256 static char __read_frequently cache_fast_lookup_enabled = true;
257
258 /*
259 * This structure describes the elements in the cache of recent
260 * names looked up by namei.
261 */
262 struct negstate {
263 u_char neg_flag;
264 u_char neg_hit;
265 };
266 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
267 "the state must fit in a union with a pointer without growing it");
268
269 struct namecache {
270 LIST_ENTRY(namecache) nc_src; /* source vnode list */
271 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
272 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
273 struct vnode *nc_dvp; /* vnode of parent of name */
274 union {
275 struct vnode *nu_vp; /* vnode the name refers to */
276 struct negstate nu_neg;/* negative entry state */
277 } n_un;
278 u_char nc_flag; /* flag bits */
279 u_char nc_nlen; /* length of name */
280 char nc_name[0]; /* segment name + nul */
281 };
282
283 /*
284 * struct namecache_ts repeats struct namecache layout up to the
285 * nc_nlen member.
286 * struct namecache_ts is used in place of struct namecache when time(s) need
287 * to be stored. The nc_dotdottime field is used when a cache entry is mapping
288 * both a non-dotdot directory name and dotdot for the directory's
289 * parent.
290 *
291 * See below for alignment requirement.
292 */
293 struct namecache_ts {
294 struct timespec nc_time; /* timespec provided by fs */
295 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */
296 int nc_ticks; /* ticks value when entry was added */
297 int nc_pad;
298 struct namecache nc_nc;
299 };
300
301 TAILQ_HEAD(cache_freebatch, namecache);
302
303 /*
304 * At least mips n32 performs 64-bit accesses to timespec as found
305 * in namecache_ts and requires them to be aligned. Since others
306 * may be in the same spot, suffer a little bit and enforce the
307 * alignment for everyone. Note this is a nop for 64-bit platforms.
308 */
309 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t)
310
311 /*
312 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
313 * 4.4 BSD codebase. Later on struct namecache was tweaked to become
314 * smaller and the value was bumped to retain the total size, but it
315 * was never re-evaluated for suitability. A simple test counting
316 * lengths during package building shows that the value of 45 covers
317 * about 86% of all added entries, reaching 99% at 65.
318 *
319 * Regardless of the above, use of dedicated zones instead of malloc may be
320 * inducing additional waste. This may be hard to address as said zones are
321 * tied to VFS SMR. Even if retaining them, the current split should be
322 * re-evaluated.
323 */
324 #ifdef __LP64__
325 #define CACHE_PATH_CUTOFF 45
326 #define CACHE_LARGE_PAD 6
327 #else
328 #define CACHE_PATH_CUTOFF 41
329 #define CACHE_LARGE_PAD 2
330 #endif
331
332 #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
333 #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
334 #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
335 #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
336
337 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
338 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
339 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
340 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
341
342 #define nc_vp n_un.nu_vp
343 #define nc_neg n_un.nu_neg
344
345 /*
346 * Flags in namecache.nc_flag
347 */
348 #define NCF_WHITE 0x01
349 #define NCF_ISDOTDOT 0x02
350 #define NCF_TS 0x04
351 #define NCF_DTS 0x08
352 #define NCF_DVDROP 0x10
353 #define NCF_NEGATIVE 0x20
354 #define NCF_INVALID 0x40
355 #define NCF_WIP 0x80
356
357 /*
358 * Flags in negstate.neg_flag
359 */
360 #define NEG_HOT 0x01
361
362 static bool cache_neg_evict_cond(u_long lnumcache);
363
364 /*
365 * Mark an entry as invalid.
366 *
367 * This is called before it starts getting deconstructed.
368 */
369 static void
370 cache_ncp_invalidate(struct namecache *ncp)
371 {
372
373 KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
374 ("%s: entry %p already invalid", __func__, ncp));
375 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
376 atomic_thread_fence_rel();
377 }
378
379 /*
380 * Check whether the entry can be safely used.
381 *
382 * All places which elide locks are supposed to call this after they are
383 * done with reading from an entry.
384 */
385 #define cache_ncp_canuse(ncp) ({ \
386 struct namecache *_ncp = (ncp); \
387 u_char _nc_flag; \
388 \
389 atomic_thread_fence_acq(); \
390 _nc_flag = atomic_load_char(&_ncp->nc_flag); \
391 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \
392 })
393
394 /*
395 * Like the above but also checks NCF_WHITE.
396 */
397 #define cache_fpl_neg_ncp_canuse(ncp) ({ \
398 struct namecache *_ncp = (ncp); \
399 u_char _nc_flag; \
400 \
401 atomic_thread_fence_acq(); \
402 _nc_flag = atomic_load_char(&_ncp->nc_flag); \
403 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \
404 })
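
/*
 * A hedged sketch of the intended lockless reader pattern (heavily abridged;
 * see cache_lookup and cache_fplookup for the real flow):
 *
 *	vfs_smr_enter();
 *	ncp = <entry found by walking the relevant hash chain>;
 *	<copy out the fields of interest from *ncp>;
 *	if (!cache_ncp_canuse(ncp))
 *		<discard the copies and fall back to the locked lookup>;
 *	vfs_smr_exit();
 */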
405
406 VFS_SMR_DECLARE;
407
408 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
409 "Name cache parameters");
410
411 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */
412 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
413 "Total namecache capacity");
414
415 u_int ncsizefactor = 2;
416 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
417 "Size factor for namecache");
418
419 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */
420 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
421 "Ratio of negative namecache entries");
422
423 /*
424 * Negative entry % of namecache capacity above which automatic eviction is allowed.
425 *
426 * Check cache_neg_evict_cond for details.
427 */
428 static u_int ncnegminpct = 3;
429
430 static u_int __read_mostly neg_min; /* the above recomputed against ncsize */
431 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
432 "Negative entry count above which automatic eviction is allowed");
433
434 /*
435 * Structures associated with name caching.
436 */
437 #define NCHHASH(hash) \
438 (&nchashtbl[(hash) & nchash])
439 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
440 static u_long __read_mostly nchash; /* size of hash table */
441 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
442 "Size of namecache hash table");
443 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */
444 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */
445
446 struct nchstats nchstats; /* cache effectiveness statistics */
447
448 static bool __read_mostly cache_rename_add = true;
449 SYSCTL_BOOL(_vfs, OID_AUTO, cache_rename_add, CTLFLAG_RW,
450 &cache_rename_add, 0, "");
451
452 static u_int __exclusive_cache_line neg_cycle;
453
454 #define ncneghash 3
455 #define numneglists (ncneghash + 1)
456
457 struct neglist {
458 struct mtx nl_evict_lock;
459 struct mtx nl_lock __aligned(CACHE_LINE_SIZE);
460 TAILQ_HEAD(, namecache) nl_list;
461 TAILQ_HEAD(, namecache) nl_hotlist;
462 u_long nl_hotnum;
463 } __aligned(CACHE_LINE_SIZE);
464
465 static struct neglist neglists[numneglists];
466
467 static inline struct neglist *
468 NCP2NEGLIST(struct namecache *ncp)
469 {
470
471 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
472 }
473
474 static inline struct negstate *
475 NCP2NEGSTATE(struct namecache *ncp)
476 {
477
478 MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
479 return (&ncp->nc_neg);
480 }
481
482 #define numbucketlocks (ncbuckethash + 1)
483 static u_int __read_mostly ncbuckethash;
484 static struct mtx_padalign __read_mostly *bucketlocks;
485 #define HASH2BUCKETLOCK(hash) \
486 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
487
488 #define numvnodelocks (ncvnodehash + 1)
489 static u_int __read_mostly ncvnodehash;
490 static struct mtx __read_mostly *vnodelocks;
491 static inline struct mtx *
492 VP2VNODELOCK(struct vnode *vp)
493 {
494
495 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
496 }
497
498 static void
499 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
500 {
501 struct namecache_ts *ncp_ts;
502
503 KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
504 (tsp == NULL && ticksp == NULL),
505 ("No NCF_TS"));
506
507 if (tsp == NULL)
508 return;
509
510 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
511 *tsp = ncp_ts->nc_time;
512 *ticksp = ncp_ts->nc_ticks;
513 }
514
515 #ifdef DEBUG_CACHE
516 static int __read_mostly doingcache = 1; /* 1 => enable the cache */
517 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
518 "VFS namecache enabled");
519 #endif
520
521 /* Export size information to userland */
522 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
523 sizeof(struct namecache), "sizeof(struct namecache)");
524
525 /*
526 * The new name cache statistics
527 */
528 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
529 "Name cache statistics");
530
531 #define STATNODE_ULONG(name, varname, descr) \
532 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
533 #define STATNODE_COUNTER(name, varname, descr) \
534 static COUNTER_U64_DEFINE_EARLY(varname); \
535 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
536 descr);
537 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
538 STATNODE_ULONG(count, numcache, "Number of cache entries");
539 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
540 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
541 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
542 STATNODE_COUNTER(dotdothits, dotdothits, "Number of '..' hits");
543 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
544 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
545 STATNODE_COUNTER(posszaps, numposzaps,
546 "Number of cache hits (positive) we do not want to cache");
547 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
548 STATNODE_COUNTER(negzaps, numnegzaps,
549 "Number of cache hits (negative) we do not want to cache");
550 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
551 /* These count for vn_getcwd(), too. */
552 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
553 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
554 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
555 "Number of fullpath search errors (VOP_VPTOCNP failures)");
556 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
557 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
558 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");
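
/*
 * The counters above can be examined from userland, e.g.:
 *
 *	# sysctl vfs.cache.stats
 */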
559
560 /*
561 * Debug or developer statistics.
562 */
563 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
564 "Name cache debugging");
565 #define DEBUGNODE_ULONG(name, varname, descr) \
566 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
567 #define DEBUGNODE_COUNTER(name, varname, descr) \
568 static COUNTER_U64_DEFINE_EARLY(varname); \
569 SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
570 descr);
571 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
572 "Number of successful removals after relocking");
573 static long zap_bucket_fail;
574 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
575 static long zap_bucket_fail2;
576 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
577 static long cache_lock_vnodes_cel_3_failures;
578 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
579 "Number of times 3-way vnode locking failed");
580
581 static void cache_zap_locked(struct namecache *ncp);
582 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
583 char **retbuf, size_t *buflen, size_t addend);
584 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
585 char **retbuf, size_t *buflen);
586 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
587 char **retbuf, size_t *len, size_t addend);
588
589 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
590
591 static inline void
592 cache_assert_vlp_locked(struct mtx *vlp)
593 {
594
595 if (vlp != NULL)
596 mtx_assert(vlp, MA_OWNED);
597 }
598
599 static inline void
600 cache_assert_vnode_locked(struct vnode *vp)
601 {
602 struct mtx *vlp;
603
604 vlp = VP2VNODELOCK(vp);
605 cache_assert_vlp_locked(vlp);
606 }
607
608 /*
609 * Directory vnodes with entries are held for two reasons:
610 * 1. make them less of a target for reclamation in vnlru
611 * 2. suffer smaller performance penalty in locked lookup as requeueing is avoided
612 *
613 * It will be feasible to stop doing it altogether if all filesystems start
614 * supporting lockless lookup.
615 */
616 static void
617 cache_hold_vnode(struct vnode *vp)
618 {
619
620 cache_assert_vnode_locked(vp);
621 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
622 vhold(vp);
623 counter_u64_add(numcachehv, 1);
624 }
625
626 static void
627 cache_drop_vnode(struct vnode *vp)
628 {
629
630 /*
631 * Called after all locks are dropped, meaning we can't assert
632 * on the state of v_cache_src.
633 */
634 vdrop(vp);
635 counter_u64_add(numcachehv, -1);
636 }
637
638 /*
639 * UMA zones.
640 */
641 static uma_zone_t __read_mostly cache_zone_small;
642 static uma_zone_t __read_mostly cache_zone_small_ts;
643 static uma_zone_t __read_mostly cache_zone_large;
644 static uma_zone_t __read_mostly cache_zone_large_ts;
645
646 char *
647 cache_symlink_alloc(size_t size, int flags)
648 {
649
650 if (size < CACHE_ZONE_SMALL_SIZE) {
651 return (uma_zalloc_smr(cache_zone_small, flags));
652 }
653 if (size < CACHE_ZONE_LARGE_SIZE) {
654 return (uma_zalloc_smr(cache_zone_large, flags));
655 }
656 counter_u64_add(symlinktoobig, 1);
657 SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
658 return (NULL);
659 }
660
661 void
662 cache_symlink_free(char *string, size_t size)
663 {
664
665 MPASS(string != NULL);
666 KASSERT(size < CACHE_ZONE_LARGE_SIZE,
667 ("%s: size %zu too big", __func__, size));
668
669 if (size < CACHE_ZONE_SMALL_SIZE) {
670 uma_zfree_smr(cache_zone_small, string);
671 return;
672 }
673 if (size < CACHE_ZONE_LARGE_SIZE) {
674 uma_zfree_smr(cache_zone_large, string);
675 return;
676 }
677 __assert_unreachable();
678 }
679
680 static struct namecache *
681 cache_alloc_uma(int len, bool ts)
682 {
683 struct namecache_ts *ncp_ts;
684 struct namecache *ncp;
685
686 if (__predict_false(ts)) {
687 if (len <= CACHE_PATH_CUTOFF)
688 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
689 else
690 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
691 ncp = &ncp_ts->nc_nc;
692 } else {
693 if (len <= CACHE_PATH_CUTOFF)
694 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
695 else
696 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
697 }
698 return (ncp);
699 }
700
701 static void
702 cache_free_uma(struct namecache *ncp)
703 {
704 struct namecache_ts *ncp_ts;
705
706 if (__predict_false(ncp->nc_flag & NCF_TS)) {
707 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
708 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
709 uma_zfree_smr(cache_zone_small_ts, ncp_ts);
710 else
711 uma_zfree_smr(cache_zone_large_ts, ncp_ts);
712 } else {
713 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
714 uma_zfree_smr(cache_zone_small, ncp);
715 else
716 uma_zfree_smr(cache_zone_large, ncp);
717 }
718 }
719
720 static struct namecache *
721 cache_alloc(int len, bool ts)
722 {
723 u_long lnumcache;
724
725 /*
726 * Avoid blowout in namecache entries.
727 *
728 * Bugs:
729 * 1. filesystems may end up trying to add an already existing entry
730 * (for example this can happen after a cache miss during concurrent
731 * lookup), in which case we will call cache_neg_evict despite not
732 * adding anything.
733 * 2. the routine may fail to free anything and no provisions are made
734 * to make it try harder (see the inside for failure modes)
735 * 3. it only ever looks at negative entries.
736 */
737 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
738 if (cache_neg_evict_cond(lnumcache)) {
739 lnumcache = atomic_load_long(&numcache);
740 }
741 if (__predict_false(lnumcache >= ncsize)) {
742 atomic_subtract_long(&numcache, 1);
743 counter_u64_add(numdrops, 1);
744 return (NULL);
745 }
746 return (cache_alloc_uma(len, ts));
747 }
748
749 static void
750 cache_free(struct namecache *ncp)
751 {
752
753 MPASS(ncp != NULL);
754 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
755 cache_drop_vnode(ncp->nc_dvp);
756 }
757 cache_free_uma(ncp);
758 atomic_subtract_long(&numcache, 1);
759 }
760
761 static void
762 cache_free_batch(struct cache_freebatch *batch)
763 {
764 struct namecache *ncp, *nnp;
765 int i;
766
767 i = 0;
768 if (TAILQ_EMPTY(batch))
769 goto out;
770 TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
771 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
772 cache_drop_vnode(ncp->nc_dvp);
773 }
774 cache_free_uma(ncp);
775 i++;
776 }
777 atomic_subtract_long(&numcache, i);
778 out:
779 SDT_PROBE1(vfs, namecache, purge, batch, i);
780 }
781
782 /*
783 * Hashing.
784 *
785 * The code was made to use FNV in 2001 and this choice needs to be revisited.
786 *
787 * Short summary of the difficulty:
788 * The longest name which can be inserted is NAME_MAX characters in length (or
789 * 255 at the time of writing this comment), while majority of names used in
790 * practice are significantly shorter (mostly below 10). More importantly
791 * majority of lookups performed find names are even shorter than that.
792 *
793 * This poses a problem where hashes which do better than FNV past word size
794 * (or so) tend to come with additional overhead when finalizing the result,
795 * making them noticeably slower for the most commonly used range.
796 *
797 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
798 *
799 * When looking it up the most time consuming part by a large margin (at least
800 * on amd64) is hashing. Replacing FNV with something which pessimizes short
801 * input would make the slowest part stand out even more.
802 */
803
804 /*
805 * TODO: With the value stored we can do better than computing the hash based
806 * on the address.
807 */
808 static void
809 cache_prehash(struct vnode *vp)
810 {
811
812 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
813 }
814
815 static uint32_t
816 cache_get_hash(char *name, u_char len, struct vnode *dvp)
817 {
818
819 return (fnv_32_buf(name, len, dvp->v_nchash));
820 }
821
822 static uint32_t
823 cache_get_hash_iter_start(struct vnode *dvp)
824 {
825
826 return (dvp->v_nchash);
827 }
828
829 static uint32_t
830 cache_get_hash_iter(char c, uint32_t hash)
831 {
832
833 return (fnv_32_buf(&c, 1, hash));
834 }
835
836 static uint32_t
837 cache_get_hash_iter_finish(uint32_t hash)
838 {
839
840 return (hash);
841 }
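
/*
 * Illustration only: since FNV consumes its input one byte at a time, feeding
 * a name through the iterator is expected to be equivalent to hashing it in
 * one go, e.g. for the 3-character name "foo" under directory dvp:
 *
 *	hash = cache_get_hash_iter_start(dvp);
 *	hash = cache_get_hash_iter('f', hash);
 *	hash = cache_get_hash_iter('o', hash);
 *	hash = cache_get_hash_iter('o', hash);
 *	hash = cache_get_hash_iter_finish(hash);
 *	MPASS(hash == cache_get_hash("foo", 3, dvp));
 */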
842
843 static inline struct nchashhead *
844 NCP2BUCKET(struct namecache *ncp)
845 {
846 uint32_t hash;
847
848 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
849 return (NCHHASH(hash));
850 }
851
852 static inline struct mtx *
853 NCP2BUCKETLOCK(struct namecache *ncp)
854 {
855 uint32_t hash;
856
857 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
858 return (HASH2BUCKETLOCK(hash));
859 }
860
861 #ifdef INVARIANTS
862 static void
863 cache_assert_bucket_locked(struct namecache *ncp)
864 {
865 struct mtx *blp;
866
867 blp = NCP2BUCKETLOCK(ncp);
868 mtx_assert(blp, MA_OWNED);
869 }
870
871 static void
872 cache_assert_bucket_unlocked(struct namecache *ncp)
873 {
874 struct mtx *blp;
875
876 blp = NCP2BUCKETLOCK(ncp);
877 mtx_assert(blp, MA_NOTOWNED);
878 }
879 #else
880 #define cache_assert_bucket_locked(x) do { } while (0)
881 #define cache_assert_bucket_unlocked(x) do { } while (0)
882 #endif
883
884 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
885 static void
886 _cache_sort_vnodes(void **p1, void **p2)
887 {
888 void *tmp;
889
890 MPASS(*p1 != NULL || *p2 != NULL);
891
892 if (*p1 > *p2) {
893 tmp = *p2;
894 *p2 = *p1;
895 *p1 = tmp;
896 }
897 }
898
899 static void
900 cache_lock_all_buckets(void)
901 {
902 u_int i;
903
904 for (i = 0; i < numbucketlocks; i++)
905 mtx_lock(&bucketlocks[i]);
906 }
907
908 static void
909 cache_unlock_all_buckets(void)
910 {
911 u_int i;
912
913 for (i = 0; i < numbucketlocks; i++)
914 mtx_unlock(&bucketlocks[i]);
915 }
916
917 static void
918 cache_lock_all_vnodes(void)
919 {
920 u_int i;
921
922 for (i = 0; i < numvnodelocks; i++)
923 mtx_lock(&vnodelocks[i]);
924 }
925
926 static void
927 cache_unlock_all_vnodes(void)
928 {
929 u_int i;
930
931 for (i = 0; i < numvnodelocks; i++)
932 mtx_unlock(&vnodelocks[i]);
933 }
934
935 static int
936 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
937 {
938
939 cache_sort_vnodes(&vlp1, &vlp2);
940
941 if (vlp1 != NULL) {
942 if (!mtx_trylock(vlp1))
943 return (EAGAIN);
944 }
945 if (!mtx_trylock(vlp2)) {
946 if (vlp1 != NULL)
947 mtx_unlock(vlp1);
948 return (EAGAIN);
949 }
950
951 return (0);
952 }
953
954 static void
955 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
956 {
957
958 MPASS(vlp1 != NULL || vlp2 != NULL);
959 MPASS(vlp1 <= vlp2);
960
961 if (vlp1 != NULL)
962 mtx_lock(vlp1);
963 if (vlp2 != NULL)
964 mtx_lock(vlp2);
965 }
966
967 static void
968 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
969 {
970
971 MPASS(vlp1 != NULL || vlp2 != NULL);
972
973 if (vlp1 != NULL)
974 mtx_unlock(vlp1);
975 if (vlp2 != NULL)
976 mtx_unlock(vlp2);
977 }
978
979 static int
980 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
981 {
982 struct nchstats snap;
983
984 if (req->oldptr == NULL)
985 return (SYSCTL_OUT(req, 0, sizeof(snap)));
986
987 snap = nchstats;
988 snap.ncs_goodhits = counter_u64_fetch(numposhits);
989 snap.ncs_neghits = counter_u64_fetch(numneghits);
990 snap.ncs_badhits = counter_u64_fetch(numposzaps) +
991 counter_u64_fetch(numnegzaps);
992 snap.ncs_miss = counter_u64_fetch(nummisszap) +
993 counter_u64_fetch(nummiss);
994
995 return (SYSCTL_OUT(req, &snap, sizeof(snap)));
996 }
997 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
998 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
999 "VFS cache effectiveness statistics");
1000
1001 static void
1002 cache_recalc_neg_min(u_int val)
1003 {
1004
1005 neg_min = (ncsize * val) / 100;
1006 }
1007
1008 static int
1009 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
1010 {
1011 u_int val;
1012 int error;
1013
1014 val = ncnegminpct;
1015 error = sysctl_handle_int(oidp, &val, 0, req);
1016 if (error != 0 || req->newptr == NULL)
1017 return (error);
1018
1019 if (val == ncnegminpct)
1020 return (0);
1021 if (val < 0 || val > 99)
1022 return (EINVAL);
1023 ncnegminpct = val;
1024 cache_recalc_neg_min(val);
1025 return (0);
1026 }
1027
1028 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
1029 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
1030 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
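
/*
 * Example (illustration only): allow automatic eviction once negative entries
 * exceed 5% of the capacity:
 *
 *	# sysctl vfs.cache.param.negminpct=5
 */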
1031
1032 #ifdef DEBUG_CACHE
1033 /*
1034 * Grab an atomic snapshot of the name cache hash chain lengths
1035 */
1036 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
1037 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
1038 "hash table stats");
1039
1040 static int
1041 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
1042 {
1043 struct nchashhead *ncpp;
1044 struct namecache *ncp;
1045 int i, error, n_nchash, *cntbuf;
1046
1047 retry:
1048 n_nchash = nchash + 1; /* nchash is max index, not count */
1049 if (req->oldptr == NULL)
1050 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
1051 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
1052 cache_lock_all_buckets();
1053 if (n_nchash != nchash + 1) {
1054 cache_unlock_all_buckets();
1055 free(cntbuf, M_TEMP);
1056 goto retry;
1057 }
1058 /* Scan hash tables counting entries */
1059 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
1060 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
1061 cntbuf[i]++;
1062 cache_unlock_all_buckets();
1063 for (error = 0, i = 0; i < n_nchash; i++)
1064 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
1065 break;
1066 free(cntbuf, M_TEMP);
1067 return (error);
1068 }
1069 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
1070 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
1071 "nchash chain lengths");
1072
1073 static int
1074 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
1075 {
1076 int error;
1077 struct nchashhead *ncpp;
1078 struct namecache *ncp;
1079 int n_nchash;
1080 int count, maxlength, used, pct;
1081
1082 if (!req->oldptr)
1083 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
1084
1085 cache_lock_all_buckets();
1086 n_nchash = nchash + 1; /* nchash is max index, not count */
1087 used = 0;
1088 maxlength = 0;
1089
1090 /* Scan hash tables for applicable entries */
1091 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
1092 count = 0;
1093 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
1094 count++;
1095 }
1096 if (count)
1097 used++;
1098 if (maxlength < count)
1099 maxlength = count;
1100 }
1101 n_nchash = nchash + 1;
1102 cache_unlock_all_buckets();
1103 pct = (used * 100) / (n_nchash / 100);
1104 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
1105 if (error)
1106 return (error);
1107 error = SYSCTL_OUT(req, &used, sizeof(used));
1108 if (error)
1109 return (error);
1110 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
1111 if (error)
1112 return (error);
1113 error = SYSCTL_OUT(req, &pct, sizeof(pct));
1114 if (error)
1115 return (error);
1116 return (0);
1117 }
1118 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
1119 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
1120 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
1121 #endif
1122
1123 /*
1124 * Negative entries management
1125 *
1126 * Various workloads create plenty of negative entries and barely use them
1127 * afterwards. Moreover malicious users can keep performing bogus lookups
1128 * adding even more entries. For example "make tinderbox" as of writing this
1129 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
1130 * negative.
1131 *
1132 * As such, a rather aggressive eviction method is needed. The currently
1133 * employed method is a placeholder.
1134 *
1135 * Entries are split over numneglists separate lists, each of which is further
1136 * split into hot and cold entries. Entries get promoted after getting a hit.
1137 * Eviction happens on addition of new entry.
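 *
 * The resulting state can be inspected from userland via the sysctls defined
 * below, e.g.:
 *
 *	# sysctl vfs.cache.neg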
1138 */
1139 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1140 "Name cache negative entry statistics");
1141
1142 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
1143 "Number of negative cache entries");
1144
1145 static COUNTER_U64_DEFINE_EARLY(neg_created);
1146 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
1147 "Number of created negative entries");
1148
1149 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
1150 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
1151 "Number of evicted negative entries");
1152
1153 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
1154 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
1155 &neg_evict_skipped_empty,
1156 "Number of times evicting failed due to lack of entries");
1157
1158 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
1159 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
1160 &neg_evict_skipped_missed,
1161 "Number of times evicting failed due to target entry disappearing");
1162
1163 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
1164 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
1165 &neg_evict_skipped_contended,
1166 "Number of times evicting failed due to contention");
1167
1168 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
1169 "Number of cache hits (negative)");
1170
1171 static int
1172 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1173 {
1174 int i, out;
1175
1176 out = 0;
1177 for (i = 0; i < numneglists; i++)
1178 out += neglists[i].nl_hotnum;
1179
1180 return (SYSCTL_OUT(req, &out, sizeof(out)));
1181 }
1182 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1183 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1184 "Number of hot negative entries");
1185
1186 static void
1187 cache_neg_init(struct namecache *ncp)
1188 {
1189 struct negstate *ns;
1190
1191 ncp->nc_flag |= NCF_NEGATIVE;
1192 ns = NCP2NEGSTATE(ncp);
1193 ns->neg_flag = 0;
1194 ns->neg_hit = 0;
1195 counter_u64_add(neg_created, 1);
1196 }
1197
1198 #define CACHE_NEG_PROMOTION_THRESH 2
1199
1200 static bool
1201 cache_neg_hit_prep(struct namecache *ncp)
1202 {
1203 struct negstate *ns;
1204 u_char n;
1205
1206 ns = NCP2NEGSTATE(ncp);
1207 n = atomic_load_char(&ns->neg_hit);
1208 for (;;) {
1209 if (n >= CACHE_NEG_PROMOTION_THRESH)
1210 return (false);
1211 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1212 break;
1213 }
1214 return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1215 }
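
/*
 * The expected calling pattern (as used by e.g. cache_lookup_dotdot below):
 *
 *	if (cache_neg_hit_prep(ncp))
 *		cache_neg_promote(ncp);
 *	else
 *		cache_neg_hit_finish(ncp);
 */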
1216
1217 /*
1218 * Nothing to do here but it is provided for completeness as some
1219 * cache_neg_hit_prep callers may end up returning without even
1220 * trying to promote.
1221 */
1222 #define cache_neg_hit_abort(ncp) do { } while (0)
1223
1224 static void
1225 cache_neg_hit_finish(struct namecache *ncp)
1226 {
1227
1228 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1229 counter_u64_add(numneghits, 1);
1230 }
1231
1232 /*
1233 * Move a negative entry to the hot list.
1234 */
1235 static void
1236 cache_neg_promote_locked(struct namecache *ncp)
1237 {
1238 struct neglist *nl;
1239 struct negstate *ns;
1240
1241 ns = NCP2NEGSTATE(ncp);
1242 nl = NCP2NEGLIST(ncp);
1243 mtx_assert(&nl->nl_lock, MA_OWNED);
1244 if ((ns->neg_flag & NEG_HOT) == 0) {
1245 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1246 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1247 nl->nl_hotnum++;
1248 ns->neg_flag |= NEG_HOT;
1249 }
1250 }
1251
1252 /*
1253 * Move a hot negative entry to the cold list.
1254 */
1255 static void
1256 cache_neg_demote_locked(struct namecache *ncp)
1257 {
1258 struct neglist *nl;
1259 struct negstate *ns;
1260
1261 ns = NCP2NEGSTATE(ncp);
1262 nl = NCP2NEGLIST(ncp);
1263 mtx_assert(&nl->nl_lock, MA_OWNED);
1264 MPASS(ns->neg_flag & NEG_HOT);
1265 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1266 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1267 nl->nl_hotnum--;
1268 ns->neg_flag &= ~NEG_HOT;
1269 atomic_store_char(&ns->neg_hit, 0);
1270 }
1271
1272 /*
1273 * Move a negative entry to the hot list if it matches the lookup.
1274 *
1275 * We have to take locks, but they may be contended and in the worst
1276 * case we may need to go off CPU. We don't want to spin within the
1277 * smr section and we can't block with it. Exiting the section means
1278 * the found entry could have been evicted. We are going to look it
1279 * up again.
1280 */
1281 static bool
1282 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1283 struct namecache *oncp, uint32_t hash)
1284 {
1285 struct namecache *ncp;
1286 struct neglist *nl;
1287 u_char nc_flag;
1288
1289 nl = NCP2NEGLIST(oncp);
1290
1291 mtx_lock(&nl->nl_lock);
1292 /*
1293 * For hash iteration.
1294 */
1295 vfs_smr_enter();
1296
1297 /*
1298 * Avoid all surprises by only succeeding if we got the same entry and
1299 * bailing completely otherwise.
1300 * XXX There are no provisions to keep the vnode around, meaning we may
1301 * end up promoting a negative entry for a *new* vnode and returning
1302 * ENOENT on its account. This is the error we want to return anyway
1303 * and promotion is harmless.
1304 *
1305 * In particular at this point there can be a new ncp which matches the
1306 * search but hashes to a different neglist.
1307 */
1308 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1309 if (ncp == oncp)
1310 break;
1311 }
1312
1313 /*
1314 * No match to begin with.
1315 */
1316 if (__predict_false(ncp == NULL)) {
1317 goto out_abort;
1318 }
1319
1320 /*
1321 * The newly found entry may be something different...
1322 */
1323 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1324 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
1325 goto out_abort;
1326 }
1327
1328 /*
1329 * ... and not even negative.
1330 */
1331 nc_flag = atomic_load_char(&ncp->nc_flag);
1332 if ((nc_flag & NCF_NEGATIVE) == 0) {
1333 goto out_abort;
1334 }
1335
1336 if (!cache_ncp_canuse(ncp)) {
1337 goto out_abort;
1338 }
1339
1340 cache_neg_promote_locked(ncp);
1341 cache_neg_hit_finish(ncp);
1342 vfs_smr_exit();
1343 mtx_unlock(&nl->nl_lock);
1344 return (true);
1345 out_abort:
1346 vfs_smr_exit();
1347 mtx_unlock(&nl->nl_lock);
1348 return (false);
1349 }
1350
1351 static void
1352 cache_neg_promote(struct namecache *ncp)
1353 {
1354 struct neglist *nl;
1355
1356 nl = NCP2NEGLIST(ncp);
1357 mtx_lock(&nl->nl_lock);
1358 cache_neg_promote_locked(ncp);
1359 mtx_unlock(&nl->nl_lock);
1360 }
1361
1362 static void
1363 cache_neg_insert(struct namecache *ncp)
1364 {
1365 struct neglist *nl;
1366
1367 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1368 cache_assert_bucket_locked(ncp);
1369 nl = NCP2NEGLIST(ncp);
1370 mtx_lock(&nl->nl_lock);
1371 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1372 mtx_unlock(&nl->nl_lock);
1373 atomic_add_long(&numneg, 1);
1374 }
1375
1376 static void
1377 cache_neg_remove(struct namecache *ncp)
1378 {
1379 struct neglist *nl;
1380 struct negstate *ns;
1381
1382 cache_assert_bucket_locked(ncp);
1383 nl = NCP2NEGLIST(ncp);
1384 ns = NCP2NEGSTATE(ncp);
1385 mtx_lock(&nl->nl_lock);
1386 if ((ns->neg_flag & NEG_HOT) != 0) {
1387 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1388 nl->nl_hotnum--;
1389 } else {
1390 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1391 }
1392 mtx_unlock(&nl->nl_lock);
1393 atomic_subtract_long(&numneg, 1);
1394 }
1395
1396 static struct neglist *
1397 cache_neg_evict_select_list(void)
1398 {
1399 struct neglist *nl;
1400 u_int c;
1401
1402 c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1403 nl = &neglists[c % numneglists];
1404 if (!mtx_trylock(&nl->nl_evict_lock)) {
1405 counter_u64_add(neg_evict_skipped_contended, 1);
1406 return (NULL);
1407 }
1408 return (nl);
1409 }
1410
1411 static struct namecache *
1412 cache_neg_evict_select_entry(struct neglist *nl)
1413 {
1414 struct namecache *ncp, *lncp;
1415 struct negstate *ns, *lns;
1416 int i;
1417
1418 mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1419 mtx_assert(&nl->nl_lock, MA_OWNED);
1420 ncp = TAILQ_FIRST(&nl->nl_list);
1421 if (ncp == NULL)
1422 return (NULL);
1423 lncp = ncp;
1424 lns = NCP2NEGSTATE(lncp);
1425 for (i = 1; i < 4; i++) {
1426 ncp = TAILQ_NEXT(ncp, nc_dst);
1427 if (ncp == NULL)
1428 break;
1429 ns = NCP2NEGSTATE(ncp);
1430 if (ns->neg_hit < lns->neg_hit) {
1431 lncp = ncp;
1432 lns = ns;
1433 }
1434 }
1435 return (lncp);
1436 }
1437
1438 static bool
1439 cache_neg_evict(void)
1440 {
1441 struct namecache *ncp, *ncp2;
1442 struct neglist *nl;
1443 struct vnode *dvp;
1444 struct mtx *dvlp;
1445 struct mtx *blp;
1446 uint32_t hash;
1447 u_char nlen;
1448 bool evicted;
1449
1450 nl = cache_neg_evict_select_list();
1451 if (nl == NULL) {
1452 return (false);
1453 }
1454
1455 mtx_lock(&nl->nl_lock);
1456 ncp = TAILQ_FIRST(&nl->nl_hotlist);
1457 if (ncp != NULL) {
1458 cache_neg_demote_locked(ncp);
1459 }
1460 ncp = cache_neg_evict_select_entry(nl);
1461 if (ncp == NULL) {
1462 counter_u64_add(neg_evict_skipped_empty, 1);
1463 mtx_unlock(&nl->nl_lock);
1464 mtx_unlock(&nl->nl_evict_lock);
1465 return (false);
1466 }
1467 nlen = ncp->nc_nlen;
1468 dvp = ncp->nc_dvp;
1469 hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1470 dvlp = VP2VNODELOCK(dvp);
1471 blp = HASH2BUCKETLOCK(hash);
1472 mtx_unlock(&nl->nl_lock);
1473 mtx_unlock(&nl->nl_evict_lock);
1474 mtx_lock(dvlp);
1475 mtx_lock(blp);
1476 /*
1477 * Note that since all locks were dropped above, the entry may be
1478 * gone or reallocated to be something else.
1479 */
1480 CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1481 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1482 ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1483 break;
1484 }
1485 if (ncp2 == NULL) {
1486 counter_u64_add(neg_evict_skipped_missed, 1);
1487 ncp = NULL;
1488 evicted = false;
1489 } else {
1490 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1491 MPASS(blp == NCP2BUCKETLOCK(ncp));
1492 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1493 ncp->nc_name);
1494 cache_zap_locked(ncp);
1495 counter_u64_add(neg_evicted, 1);
1496 evicted = true;
1497 }
1498 mtx_unlock(blp);
1499 mtx_unlock(dvlp);
1500 if (ncp != NULL)
1501 cache_free(ncp);
1502 return (evicted);
1503 }
1504
1505 /*
1506 * Maybe evict a negative entry to create more room.
1507 *
1508 * The ncnegfactor parameter limits what fraction of the total count
1509 * negative entries can comprise. However, if the cache is just
1510 * warming up this leads to excessive evictions. As such, ncnegminpct
1511 * (recomputed to neg_min) dictates whether the above should be
1512 * applied.
1513 *
1514 * Try evicting if the cache is close to full capacity regardless of
1515 * other considerations.
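 *
 * Worked example: with ncsize = 100000, ncnegfactor = 5 and ncnegminpct = 3
 * (giving neg_min = 3000), eviction is attempted once there are at least 3000
 * negative entries and they make up at least 1/5 of all entries, or once the
 * total entry count comes within 1000 of ncsize.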
1516 */
1517 static bool
1518 cache_neg_evict_cond(u_long lnumcache)
1519 {
1520 u_long lnumneg;
1521
1522 if (ncsize - 1000 < lnumcache)
1523 goto out_evict;
1524 lnumneg = atomic_load_long(&numneg);
1525 if (lnumneg < neg_min)
1526 return (false);
1527 if (lnumneg * ncnegfactor < lnumcache)
1528 return (false);
1529 out_evict:
1530 return (cache_neg_evict());
1531 }
1532
1533 /*
1534 * cache_zap_locked():
1535 *
1536 * Removes a namecache entry from cache, whether it contains an actual
1537 * pointer to a vnode or if it is just a negative cache entry.
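 *
 * The caller must hold the bucket lock and the vnode lock(s) for nc_dvp and,
 * for positive entries, nc_vp (see the asserts below). A hedged sketch of the
 * caller side, modeled on cache_zap_unlocked_bucket:
 *
 *	cache_sort_vnodes(&dvlp, &vlp);
 *	cache_lock_vnodes(dvlp, vlp);
 *	mtx_lock(blp);
 *	cache_zap_locked(ncp);
 *	mtx_unlock(blp);
 *	cache_unlock_vnodes(dvlp, vlp);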
1538 */
1539 static void
1540 cache_zap_locked(struct namecache *ncp)
1541 {
1542 struct nchashhead *ncpp;
1543 struct vnode *dvp, *vp;
1544
1545 dvp = ncp->nc_dvp;
1546 vp = ncp->nc_vp;
1547
1548 if (!(ncp->nc_flag & NCF_NEGATIVE))
1549 cache_assert_vnode_locked(vp);
1550 cache_assert_vnode_locked(dvp);
1551 cache_assert_bucket_locked(ncp);
1552
1553 cache_ncp_invalidate(ncp);
1554
1555 ncpp = NCP2BUCKET(ncp);
1556 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1557 if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1558 SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
1559 TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
1560 if (ncp == vp->v_cache_dd) {
1561 atomic_store_ptr(&vp->v_cache_dd, NULL);
1562 }
1563 } else {
1564 SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
1565 cache_neg_remove(ncp);
1566 }
1567 if (ncp->nc_flag & NCF_ISDOTDOT) {
1568 if (ncp == dvp->v_cache_dd) {
1569 atomic_store_ptr(&dvp->v_cache_dd, NULL);
1570 }
1571 } else {
1572 LIST_REMOVE(ncp, nc_src);
1573 if (LIST_EMPTY(&dvp->v_cache_src)) {
1574 ncp->nc_flag |= NCF_DVDROP;
1575 }
1576 }
1577 }
1578
1579 static void
1580 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1581 {
1582 struct mtx *blp;
1583
1584 MPASS(ncp->nc_dvp == vp);
1585 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1586 cache_assert_vnode_locked(vp);
1587
1588 blp = NCP2BUCKETLOCK(ncp);
1589 mtx_lock(blp);
1590 cache_zap_locked(ncp);
1591 mtx_unlock(blp);
1592 }
1593
1594 static bool
1595 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1596 struct mtx **vlpp)
1597 {
1598 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1599 struct mtx *blp;
1600
1601 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1602 cache_assert_vnode_locked(vp);
1603
1604 if (ncp->nc_flag & NCF_NEGATIVE) {
1605 if (*vlpp != NULL) {
1606 mtx_unlock(*vlpp);
1607 *vlpp = NULL;
1608 }
1609 cache_zap_negative_locked_vnode_kl(ncp, vp);
1610 return (true);
1611 }
1612
1613 pvlp = VP2VNODELOCK(vp);
1614 blp = NCP2BUCKETLOCK(ncp);
1615 vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1616 vlp2 = VP2VNODELOCK(ncp->nc_vp);
1617
1618 if (*vlpp == vlp1 || *vlpp == vlp2) {
1619 to_unlock = *vlpp;
1620 *vlpp = NULL;
1621 } else {
1622 if (*vlpp != NULL) {
1623 mtx_unlock(*vlpp);
1624 *vlpp = NULL;
1625 }
1626 cache_sort_vnodes(&vlp1, &vlp2);
1627 if (vlp1 == pvlp) {
1628 mtx_lock(vlp2);
1629 to_unlock = vlp2;
1630 } else {
1631 if (!mtx_trylock(vlp1))
1632 goto out_relock;
1633 to_unlock = vlp1;
1634 }
1635 }
1636 mtx_lock(blp);
1637 cache_zap_locked(ncp);
1638 mtx_unlock(blp);
1639 if (to_unlock != NULL)
1640 mtx_unlock(to_unlock);
1641 return (true);
1642
1643 out_relock:
1644 mtx_unlock(vlp2);
1645 mtx_lock(vlp1);
1646 mtx_lock(vlp2);
1647 MPASS(*vlpp == NULL);
1648 *vlpp = vlp1;
1649 return (false);
1650 }
1651
1652 /*
1653 * If trylocking failed we can get here. We know enough to take all needed locks
1654 * in the right order and re-lookup the entry.
1655 */
1656 static int
1657 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1658 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1659 struct mtx *blp)
1660 {
1661 struct namecache *rncp;
1662
1663 cache_assert_bucket_unlocked(ncp);
1664
1665 cache_sort_vnodes(&dvlp, &vlp);
1666 cache_lock_vnodes(dvlp, vlp);
1667 mtx_lock(blp);
1668 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1669 if (rncp == ncp && rncp->nc_dvp == dvp &&
1670 rncp->nc_nlen == cnp->cn_namelen &&
1671 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1672 break;
1673 }
1674 if (rncp != NULL) {
1675 cache_zap_locked(rncp);
1676 mtx_unlock(blp);
1677 cache_unlock_vnodes(dvlp, vlp);
1678 counter_u64_add(zap_bucket_relock_success, 1);
1679 return (0);
1680 }
1681
1682 mtx_unlock(blp);
1683 cache_unlock_vnodes(dvlp, vlp);
1684 return (EAGAIN);
1685 }
1686
1687 static int __noinline
1688 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1689 uint32_t hash, struct mtx *blp)
1690 {
1691 struct mtx *dvlp, *vlp;
1692 struct vnode *dvp;
1693
1694 cache_assert_bucket_locked(ncp);
1695
1696 dvlp = VP2VNODELOCK(ncp->nc_dvp);
1697 vlp = NULL;
1698 if (!(ncp->nc_flag & NCF_NEGATIVE))
1699 vlp = VP2VNODELOCK(ncp->nc_vp);
1700 if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1701 cache_zap_locked(ncp);
1702 mtx_unlock(blp);
1703 cache_unlock_vnodes(dvlp, vlp);
1704 return (0);
1705 }
1706
1707 dvp = ncp->nc_dvp;
1708 mtx_unlock(blp);
1709 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1710 }
1711
1712 static __noinline int
1713 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1714 {
1715 struct namecache *ncp;
1716 struct mtx *blp;
1717 struct mtx *dvlp, *dvlp2;
1718 uint32_t hash;
1719 int error;
1720
1721 if (cnp->cn_namelen == 2 &&
1722 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1723 dvlp = VP2VNODELOCK(dvp);
1724 dvlp2 = NULL;
1725 mtx_lock(dvlp);
1726 retry_dotdot:
1727 ncp = dvp->v_cache_dd;
1728 if (ncp == NULL) {
1729 mtx_unlock(dvlp);
1730 if (dvlp2 != NULL)
1731 mtx_unlock(dvlp2);
1732 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1733 return (0);
1734 }
1735 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1736 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1737 goto retry_dotdot;
1738 MPASS(dvp->v_cache_dd == NULL);
1739 mtx_unlock(dvlp);
1740 if (dvlp2 != NULL)
1741 mtx_unlock(dvlp2);
1742 cache_free(ncp);
1743 } else {
1744 atomic_store_ptr(&dvp->v_cache_dd, NULL);
1745 mtx_unlock(dvlp);
1746 if (dvlp2 != NULL)
1747 mtx_unlock(dvlp2);
1748 }
1749 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1750 return (1);
1751 }
1752
1753 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1754 blp = HASH2BUCKETLOCK(hash);
1755 retry:
1756 if (CK_SLIST_EMPTY(NCHHASH(hash)))
1757 goto out_no_entry;
1758
1759 mtx_lock(blp);
1760
1761 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1762 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1763 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1764 break;
1765 }
1766
1767 if (ncp == NULL) {
1768 mtx_unlock(blp);
1769 goto out_no_entry;
1770 }
1771
1772 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1773 if (__predict_false(error != 0)) {
1774 zap_bucket_fail++;
1775 goto retry;
1776 }
1777 counter_u64_add(numposzaps, 1);
1778 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1779 cache_free(ncp);
1780 return (1);
1781 out_no_entry:
1782 counter_u64_add(nummisszap, 1);
1783 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1784 return (0);
1785 }
1786
1787 static int __noinline
1788 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1789 struct timespec *tsp, int *ticksp)
1790 {
1791 int ltype;
1792
1793 *vpp = dvp;
1794 counter_u64_add(dothits, 1);
1795 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1796 if (tsp != NULL)
1797 timespecclear(tsp);
1798 if (ticksp != NULL)
1799 *ticksp = ticks;
1800 vrefact(*vpp);
1801 /*
1802 * When we look up "." we can still be asked to lock it
1803 * differently...
1804 */
1805 ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1806 if (ltype != VOP_ISLOCKED(*vpp)) {
1807 if (ltype == LK_EXCLUSIVE) {
1808 vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1809 if (VN_IS_DOOMED((*vpp))) {
1810 /* forced unmount */
1811 vrele(*vpp);
1812 *vpp = NULL;
1813 return (ENOENT);
1814 }
1815 } else
1816 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1817 }
1818 return (-1);
1819 }
1820
1821 static int __noinline
1822 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1823 struct timespec *tsp, int *ticksp)
1824 {
1825 struct namecache_ts *ncp_ts;
1826 struct namecache *ncp;
1827 struct mtx *dvlp;
1828 enum vgetstate vs;
1829 int error, ltype;
1830 bool whiteout;
1831
1832 MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1833
1834 if ((cnp->cn_flags & MAKEENTRY) == 0) {
1835 cache_remove_cnp(dvp, cnp);
1836 return (0);
1837 }
1838
1839 counter_u64_add(dotdothits, 1);
1840 retry:
1841 dvlp = VP2VNODELOCK(dvp);
1842 mtx_lock(dvlp);
1843 ncp = dvp->v_cache_dd;
1844 if (ncp == NULL) {
1845 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
1846 mtx_unlock(dvlp);
1847 return (0);
1848 }
1849 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1850 if (ncp->nc_flag & NCF_NEGATIVE)
1851 *vpp = NULL;
1852 else
1853 *vpp = ncp->nc_vp;
1854 } else
1855 *vpp = ncp->nc_dvp;
1856 if (*vpp == NULL)
1857 goto negative_success;
1858 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1859 cache_out_ts(ncp, tsp, ticksp);
1860 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1861 NCF_DTS && tsp != NULL) {
1862 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1863 *tsp = ncp_ts->nc_dotdottime;
1864 }
1865
1866 MPASS(dvp != *vpp);
1867 ltype = VOP_ISLOCKED(dvp);
1868 VOP_UNLOCK(dvp);
1869 vs = vget_prep(*vpp);
1870 mtx_unlock(dvlp);
1871 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1872 vn_lock(dvp, ltype | LK_RETRY);
1873 if (VN_IS_DOOMED(dvp)) {
1874 if (error == 0)
1875 vput(*vpp);
1876 *vpp = NULL;
1877 return (ENOENT);
1878 }
1879 if (error) {
1880 *vpp = NULL;
1881 goto retry;
1882 }
1883 return (-1);
1884 negative_success:
1885 if (__predict_false(cnp->cn_nameiop == CREATE)) {
1886 if (cnp->cn_flags & ISLASTCN) {
1887 counter_u64_add(numnegzaps, 1);
1888 cache_zap_negative_locked_vnode_kl(ncp, dvp);
1889 mtx_unlock(dvlp);
1890 cache_free(ncp);
1891 return (0);
1892 }
1893 }
1894
1895 whiteout = (ncp->nc_flag & NCF_WHITE);
1896 cache_out_ts(ncp, tsp, ticksp);
1897 if (cache_neg_hit_prep(ncp))
1898 cache_neg_promote(ncp);
1899 else
1900 cache_neg_hit_finish(ncp);
1901 mtx_unlock(dvlp);
1902 if (whiteout)
1903 cnp->cn_flags |= ISWHITEOUT;
1904 return (ENOENT);
1905 }
1906
1907 /**
1908 * Lookup a name in the name cache
1909 *
1910 * # Arguments
1911 *
1912 * - dvp: Parent directory in which to search.
1913 * - vpp: Return argument. Will contain desired vnode on cache hit.
1914 * - cnp: Parameters of the name search. The most interesting bits of
1915 * the cn_flags field have the following meanings:
1916 * - MAKEENTRY: If clear, free an entry from the cache rather than look
1917 * it up.
1918 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".."
1919 * - tsp: Return storage for cache timestamp. On a successful (positive
1920 * or negative) lookup, tsp will be filled with any timespec that
1921 * was stored when this cache entry was created. However, it will
1922 * be cleared for "." entries.
1923 * - ticksp: Return storage for alternate cache timestamp. On a successful
1924 * (positive or negative) lookup, it will contain the ticks value
1925 * that was current when the cache entry was created, unless cnp
1926 * was ".".
1927 *
1928 * Either both tsp and ticksp have to be provided or neither of them.
1929 *
1930 * # Returns
1931 *
1932 * - -1: A positive cache hit. vpp will contain the desired vnode.
1933 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due
1934 * to a forced unmount. vpp will not be modified. If the entry
1935 * is a whiteout, then the ISWHITEOUT flag will be set in
1936 * cnp->cn_flags.
1937 * - 0: A cache miss. vpp will not be modified.
1938 *
1939 * # Locking
1940 *
1941 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up
1942 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the
1943 * lock is not recursively acquired.
1944 */
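/*
 * Illustrative sketch (exposition only, not part of the sources): a typical
 * consumer of cache_lookup() maps the three return values the way
 * vfs_cache_lookup() does further below:
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	if (error == 0)
 *		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 *	if (error == -1)
 *		return (0);
 *	return (error);
 *
 * i.e., 0 forwards the miss to the filesystem, -1 reports a positive hit with
 * *vpp locked and referenced, and ENOENT reports a negative hit.
 */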
1945 static int __noinline
1946 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1947 struct timespec *tsp, int *ticksp)
1948 {
1949 struct namecache *ncp;
1950 struct mtx *blp;
1951 uint32_t hash;
1952 enum vgetstate vs;
1953 int error;
1954 bool whiteout;
1955
1956 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1957 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
1958
1959 retry:
1960 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1961 blp = HASH2BUCKETLOCK(hash);
1962 mtx_lock(blp);
1963
1964 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1965 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1966 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1967 break;
1968 }
1969
1970 if (__predict_false(ncp == NULL)) {
1971 mtx_unlock(blp);
1972 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
1973 counter_u64_add(nummiss, 1);
1974 return (0);
1975 }
1976
1977 if (ncp->nc_flag & NCF_NEGATIVE)
1978 goto negative_success;
1979
1980 counter_u64_add(numposhits, 1);
1981 *vpp = ncp->nc_vp;
1982 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1983 cache_out_ts(ncp, tsp, ticksp);
1984 MPASS(dvp != *vpp);
1985 vs = vget_prep(*vpp);
1986 mtx_unlock(blp);
1987 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1988 if (error) {
1989 *vpp = NULL;
1990 goto retry;
1991 }
1992 return (-1);
1993 negative_success:
1994 /*
1995 * We don't get here with regular lookup apart from corner cases.
1996 */
1997 if (__predict_true(cnp->cn_nameiop == CREATE)) {
1998 if (cnp->cn_flags & ISLASTCN) {
1999 counter_u64_add(numnegzaps, 1);
2000 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
2001 if (__predict_false(error != 0)) {
2002 zap_bucket_fail2++;
2003 goto retry;
2004 }
2005 cache_free(ncp);
2006 return (0);
2007 }
2008 }
2009
2010 whiteout = (ncp->nc_flag & NCF_WHITE);
2011 cache_out_ts(ncp, tsp, ticksp);
2012 if (cache_neg_hit_prep(ncp))
2013 cache_neg_promote(ncp);
2014 else
2015 cache_neg_hit_finish(ncp);
2016 mtx_unlock(blp);
2017 if (whiteout)
2018 cnp->cn_flags |= ISWHITEOUT;
2019 return (ENOENT);
2020 }
2021
2022 int
2023 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2024 struct timespec *tsp, int *ticksp)
2025 {
2026 struct namecache *ncp;
2027 uint32_t hash;
2028 enum vgetstate vs;
2029 int error;
2030 bool whiteout, neg_promote;
2031 u_short nc_flag;
2032
2033 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
2034
2035 #ifdef DEBUG_CACHE
2036 if (__predict_false(!doingcache)) {
2037 cnp->cn_flags &= ~MAKEENTRY;
2038 return (0);
2039 }
2040 #endif
2041
2042 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2043 if (cnp->cn_namelen == 1)
2044 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
2045 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
2046 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
2047 }
2048
2049 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2050
2051 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
2052 cache_remove_cnp(dvp, cnp);
2053 return (0);
2054 }
2055
2056 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2057 vfs_smr_enter();
2058
2059 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2060 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2061 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
2062 break;
2063 }
2064
2065 if (__predict_false(ncp == NULL)) {
2066 vfs_smr_exit();
2067 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2068 counter_u64_add(nummiss, 1);
2069 return (0);
2070 }
2071
2072 nc_flag = atomic_load_char(&ncp->nc_flag);
2073 if (nc_flag & NCF_NEGATIVE)
2074 goto negative_success;
2075
2076 counter_u64_add(numposhits, 1);
2077 *vpp = ncp->nc_vp;
2078 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2079 cache_out_ts(ncp, tsp, ticksp);
2080 MPASS(dvp != *vpp);
2081 if (!cache_ncp_canuse(ncp)) {
2082 vfs_smr_exit();
2083 *vpp = NULL;
2084 goto out_fallback;
2085 }
2086 vs = vget_prep_smr(*vpp);
2087 vfs_smr_exit();
2088 if (__predict_false(vs == VGET_NONE)) {
2089 *vpp = NULL;
2090 goto out_fallback;
2091 }
2092 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2093 if (error) {
2094 *vpp = NULL;
2095 goto out_fallback;
2096 }
2097 return (-1);
2098 negative_success:
2099 if (cnp->cn_nameiop == CREATE) {
2100 if (cnp->cn_flags & ISLASTCN) {
2101 vfs_smr_exit();
2102 goto out_fallback;
2103 }
2104 }
2105
2106 cache_out_ts(ncp, tsp, ticksp);
2107 whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
2108 neg_promote = cache_neg_hit_prep(ncp);
2109 if (!cache_ncp_canuse(ncp)) {
2110 cache_neg_hit_abort(ncp);
2111 vfs_smr_exit();
2112 goto out_fallback;
2113 }
2114 if (neg_promote) {
2115 vfs_smr_exit();
2116 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
2117 goto out_fallback;
2118 } else {
2119 cache_neg_hit_finish(ncp);
2120 vfs_smr_exit();
2121 }
2122 if (whiteout)
2123 cnp->cn_flags |= ISWHITEOUT;
2124 return (ENOENT);
2125 out_fallback:
2126 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
2127 }
2128
2129 struct celockstate {
2130 struct mtx *vlp[3];
2131 struct mtx *blp[2];
2132 };
2133 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
2134 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
2135
2136 static inline void
2137 cache_celockstate_init(struct celockstate *cel)
2138 {
2139
2140 bzero(cel, sizeof(*cel));
2141 }
2142
2143 static void
2144 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
2145 struct vnode *dvp)
2146 {
2147 struct mtx *vlp1, *vlp2;
2148
2149 MPASS(cel->vlp[0] == NULL);
2150 MPASS(cel->vlp[1] == NULL);
2151 MPASS(cel->vlp[2] == NULL);
2152
2153 MPASS(vp != NULL || dvp != NULL);
2154
2155 vlp1 = VP2VNODELOCK(vp);
2156 vlp2 = VP2VNODELOCK(dvp);
2157 cache_sort_vnodes(&vlp1, &vlp2);
2158
2159 if (vlp1 != NULL) {
2160 mtx_lock(vlp1);
2161 cel->vlp[0] = vlp1;
2162 }
2163 mtx_lock(vlp2);
2164 cel->vlp[1] = vlp2;
2165 }
2166
2167 static void
2168 cache_unlock_vnodes_cel(struct celockstate *cel)
2169 {
2170
2171 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2172
2173 if (cel->vlp[0] != NULL)
2174 mtx_unlock(cel->vlp[0]);
2175 if (cel->vlp[1] != NULL)
2176 mtx_unlock(cel->vlp[1]);
2177 if (cel->vlp[2] != NULL)
2178 mtx_unlock(cel->vlp[2]);
2179 }
2180
2181 static bool
2182 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2183 {
2184 struct mtx *vlp;
2185 bool ret;
2186
2187 cache_assert_vlp_locked(cel->vlp[0]);
2188 cache_assert_vlp_locked(cel->vlp[1]);
2189 MPASS(cel->vlp[2] == NULL);
2190
2191 MPASS(vp != NULL);
2192 vlp = VP2VNODELOCK(vp);
2193
2194 ret = true;
2195 if (vlp >= cel->vlp[1]) {
2196 mtx_lock(vlp);
2197 } else {
2198 if (mtx_trylock(vlp))
2199 goto out;
2200 cache_lock_vnodes_cel_3_failures++;
2201 cache_unlock_vnodes_cel(cel);
2202 if (vlp < cel->vlp[0]) {
2203 mtx_lock(vlp);
2204 mtx_lock(cel->vlp[0]);
2205 mtx_lock(cel->vlp[1]);
2206 } else {
2207 if (cel->vlp[0] != NULL)
2208 mtx_lock(cel->vlp[0]);
2209 mtx_lock(vlp);
2210 mtx_lock(cel->vlp[1]);
2211 }
2212 ret = false;
2213 }
2214 out:
2215 cel->vlp[2] = vlp;
2216 return (ret);
2217 }
2218
2219 static void
2220 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2221 struct mtx *blp2)
2222 {
2223
2224 MPASS(cel->blp[0] == NULL);
2225 MPASS(cel->blp[1] == NULL);
2226
2227 cache_sort_vnodes(&blp1, &blp2);
2228
2229 if (blp1 != NULL) {
2230 mtx_lock(blp1);
2231 cel->blp[0] = blp1;
2232 }
2233 mtx_lock(blp2);
2234 cel->blp[1] = blp2;
2235 }
2236
2237 static void
2238 cache_unlock_buckets_cel(struct celockstate *cel)
2239 {
2240
2241 if (cel->blp[0] != NULL)
2242 mtx_unlock(cel->blp[0]);
2243 mtx_unlock(cel->blp[1]);
2244 }
2245
2246 /*
2247 * Lock part of the cache affected by the insertion.
2248 *
2249 * This means vnodelocks for dvp, vp and the relevant bucketlock.
2250 * However, insertion can result in removal of an old entry. In this
2251 * case we have an additional vnode and bucketlock pair to lock.
2252 *
2253 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2254 * preserving the locking order (smaller address first).
2255 */
2256 static void
2257 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2258 uint32_t hash)
2259 {
2260 struct namecache *ncp;
2261 struct mtx *blps[2];
2262 u_char nc_flag;
2263
2264 blps[0] = HASH2BUCKETLOCK(hash);
2265 for (;;) {
2266 blps[1] = NULL;
2267 cache_lock_vnodes_cel(cel, dvp, vp);
2268 if (vp == NULL || vp->v_type != VDIR)
2269 break;
2270 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
2271 if (ncp == NULL)
2272 break;
2273 nc_flag = atomic_load_char(&ncp->nc_flag);
2274 if ((nc_flag & NCF_ISDOTDOT) == 0)
2275 break;
2276 MPASS(ncp->nc_dvp == vp);
2277 blps[1] = NCP2BUCKETLOCK(ncp);
2278 if ((nc_flag & NCF_NEGATIVE) != 0)
2279 break;
2280 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2281 break;
2282 /*
2283 * All vnodes got re-locked. Re-validate the state and if
2284 * nothing changed we are done. Otherwise restart.
2285 */
2286 if (ncp == vp->v_cache_dd &&
2287 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2288 blps[1] == NCP2BUCKETLOCK(ncp) &&
2289 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2290 break;
2291 cache_unlock_vnodes_cel(cel);
2292 cel->vlp[0] = NULL;
2293 cel->vlp[1] = NULL;
2294 cel->vlp[2] = NULL;
2295 }
2296 cache_lock_buckets_cel(cel, blps[0], blps[1]);
2297 }
2298
2299 static void
2300 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2301 uint32_t hash)
2302 {
2303 struct namecache *ncp;
2304 struct mtx *blps[2];
2305 u_char nc_flag;
2306
2307 blps[0] = HASH2BUCKETLOCK(hash);
2308 for (;;) {
2309 blps[1] = NULL;
2310 cache_lock_vnodes_cel(cel, dvp, vp);
2311 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
2312 if (ncp == NULL)
2313 break;
2314 nc_flag = atomic_load_char(&ncp->nc_flag);
2315 if ((nc_flag & NCF_ISDOTDOT) == 0)
2316 break;
2317 MPASS(ncp->nc_dvp == dvp);
2318 blps[1] = NCP2BUCKETLOCK(ncp);
2319 if ((nc_flag & NCF_NEGATIVE) != 0)
2320 break;
2321 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2322 break;
2323 if (ncp == dvp->v_cache_dd &&
2324 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2325 blps[1] == NCP2BUCKETLOCK(ncp) &&
2326 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2327 break;
2328 cache_unlock_vnodes_cel(cel);
2329 cel->vlp[0] = NULL;
2330 cel->vlp[1] = NULL;
2331 cel->vlp[2] = NULL;
2332 }
2333 cache_lock_buckets_cel(cel, blps[0], blps[1]);
2334 }
2335
2336 static void
2337 cache_enter_unlock(struct celockstate *cel)
2338 {
2339
2340 cache_unlock_buckets_cel(cel);
2341 cache_unlock_vnodes_cel(cel);
2342 }
2343
2344 static void __noinline
2345 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2346 struct componentname *cnp)
2347 {
2348 struct celockstate cel;
2349 struct namecache *ncp;
2350 uint32_t hash;
2351 int len;
2352
2353 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
2354 return;
2355 len = cnp->cn_namelen;
2356 cache_celockstate_init(&cel);
2357 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2358 cache_enter_lock_dd(&cel, dvp, vp, hash);
2359 ncp = dvp->v_cache_dd;
2360 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2361 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2362 cache_zap_locked(ncp);
2363 } else {
2364 ncp = NULL;
2365 }
2366 atomic_store_ptr(&dvp->v_cache_dd, NULL);
2367 cache_enter_unlock(&cel);
2368 if (ncp != NULL)
2369 cache_free(ncp);
2370 }
2371
2372 /*
2373 * Add an entry to the cache.
2374 */
2375 void
2376 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2377 struct timespec *tsp, struct timespec *dtsp)
2378 {
2379 struct celockstate cel;
2380 struct namecache *ncp, *n2, *ndd;
2381 struct namecache_ts *ncp_ts;
2382 struct nchashhead *ncpp;
2383 uint32_t hash;
2384 int flag;
2385 int len;
2386
2387 KASSERT(cnp->cn_namelen <= NAME_MAX,
2388 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
2389 NAME_MAX));
2390 VNPASS(!VN_IS_DOOMED(dvp), dvp);
2391 VNPASS(dvp->v_type != VNON, dvp);
2392 if (vp != NULL) {
2393 VNPASS(!VN_IS_DOOMED(vp), vp);
2394 VNPASS(vp->v_type != VNON, vp);
2395 }
2396 if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
2397 KASSERT(dvp == vp,
2398 ("%s: different vnodes for dot entry (%p; %p)\n", __func__,
2399 dvp, vp));
2400 } else {
2401 KASSERT(dvp != vp,
2402 ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__,
2403 cnp->cn_nameptr, dvp));
2404 }
2405
2406 #ifdef DEBUG_CACHE
2407 if (__predict_false(!doingcache))
2408 return;
2409 #endif
2410
2411 flag = 0;
2412 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2413 if (cnp->cn_namelen == 1)
2414 return;
2415 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2416 cache_enter_dotdot_prep(dvp, vp, cnp);
2417 flag = NCF_ISDOTDOT;
2418 }
2419 }
2420
2421 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2422 if (ncp == NULL)
2423 return;
2424
2425 cache_celockstate_init(&cel);
2426 ndd = NULL;
2427 ncp_ts = NULL;
2428
2429 /*
2430 * Calculate the hash key and setup as much of the new
2431 * namecache entry as possible before acquiring the lock.
2432 */
2433 ncp->nc_flag = flag | NCF_WIP;
2434 ncp->nc_vp = vp;
2435 if (vp == NULL)
2436 cache_neg_init(ncp);
2437 ncp->nc_dvp = dvp;
2438 if (tsp != NULL) {
2439 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2440 ncp_ts->nc_time = *tsp;
2441 ncp_ts->nc_ticks = ticks;
2442 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2443 if (dtsp != NULL) {
2444 ncp_ts->nc_dotdottime = *dtsp;
2445 ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2446 }
2447 }
2448 len = ncp->nc_nlen = cnp->cn_namelen;
2449 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2450 memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2451 ncp->nc_name[len] = '\0';
2452 cache_enter_lock(&cel, dvp, vp, hash);
2453
2454 /*
2455 * See if this vnode or negative entry is already in the cache
2456 * with this name. This can happen with concurrent lookups of
2457 * the same path name.
2458 */
2459 ncpp = NCHHASH(hash);
2460 CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2461 if (n2->nc_dvp == dvp &&
2462 n2->nc_nlen == cnp->cn_namelen &&
2463 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2464 MPASS(cache_ncp_canuse(n2));
2465 if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2466 KASSERT(vp == NULL,
2467 ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
2468 __func__, NULL, vp, cnp->cn_nameptr));
2469 else
2470 KASSERT(n2->nc_vp == vp,
2471 ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
2472 __func__, n2->nc_vp, vp, cnp->cn_nameptr));
2473 /*
2474 * Entries are supposed to be immutable unless in the
2475 * process of getting destroyed. Accommodating
2476 * changing timestamps is possible but not worth it.
2477 * This should be harmless in terms of correctness, in
2478 * the worst case resulting in an earlier expiration.
2479 * Alternatively, the found entry can be replaced
2480 * altogether.
2481 */
2482 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2483 #if 0
2484 if (tsp != NULL) {
2485 KASSERT((n2->nc_flag & NCF_TS) != 0,
2486 ("no NCF_TS"));
2487 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2488 n2_ts->nc_time = ncp_ts->nc_time;
2489 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2490 if (dtsp != NULL) {
2491 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2492 n2_ts->nc_nc.nc_flag |= NCF_DTS;
2493 }
2494 }
2495 #endif
2496 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2497 vp);
2498 goto out_unlock_free;
2499 }
2500 }
2501
2502 if (flag == NCF_ISDOTDOT) {
2503 /*
2504 * See if we are trying to add a .. entry, but some other lookup
2505 * has already populated the v_cache_dd pointer.
2506 */
2507 if (dvp->v_cache_dd != NULL)
2508 goto out_unlock_free;
2509 KASSERT(vp == NULL || vp->v_type == VDIR,
2510 ("wrong vnode type %p", vp));
2511 atomic_thread_fence_rel();
2512 atomic_store_ptr(&dvp->v_cache_dd, ncp);
2513 }
2514
2515 if (vp != NULL) {
2516 if (flag != NCF_ISDOTDOT) {
2517 /*
2518 * For this case, the cache entry maps both the
2519 * directory name in it and the name ".." for the
2520 * directory's parent.
2521 */
2522 if ((ndd = vp->v_cache_dd) != NULL) {
2523 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2524 cache_zap_locked(ndd);
2525 else
2526 ndd = NULL;
2527 }
2528 atomic_thread_fence_rel();
2529 atomic_store_ptr(&vp->v_cache_dd, ncp);
2530 } else if (vp->v_type != VDIR) {
2531 if (vp->v_cache_dd != NULL) {
2532 atomic_store_ptr(&vp->v_cache_dd, NULL);
2533 }
2534 }
2535 }
2536
2537 if (flag != NCF_ISDOTDOT) {
2538 if (LIST_EMPTY(&dvp->v_cache_src)) {
2539 cache_hold_vnode(dvp);
2540 }
2541 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2542 }
2543
2544 /*
2545 * If the entry is "negative", we place it into the
2546 * "negative" cache queue, otherwise, we place it into the
2547 * destination vnode's cache entries queue.
2548 */
2549 if (vp != NULL) {
2550 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2551 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2552 vp);
2553 } else {
2554 if (cnp->cn_flags & ISWHITEOUT)
2555 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
2556 cache_neg_insert(ncp);
2557 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2558 ncp->nc_name);
2559 }
2560
2561 /*
2562 * Insert the new namecache entry into the appropriate chain
2563 * within the cache entries table.
2564 */
2565 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2566
2567 atomic_thread_fence_rel();
2568 /*
2569 * Mark the entry as fully constructed.
2570 * It is immutable past this point until its removal.
2571 */
2572 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2573
2574 cache_enter_unlock(&cel);
2575 if (ndd != NULL)
2576 cache_free(ndd);
2577 return;
2578 out_unlock_free:
2579 cache_enter_unlock(&cel);
2580 cache_free(ncp);
2581 return;
2582 }
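/*
 * Illustrative note: filesystems that do not track timestamps normally add
 * entries through the cache_enter() wrapper (assumed here to be the
 * NULL-timestamp convenience macro from sys/vnode.h), as done by
 * cache_vop_rename() below:
 *
 *	cache_enter(dvp, vp, cnp);
 */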
2583
2584 /*
2585 * A variant of the above accepting flags.
2586 *
2587 * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it.
2588 *
2589 * TODO: this routine is a hack. It blindly removes the old entry, even if it
2590 * happens to match, and it does so in an inefficient manner. It was added
2591 * to accommodate NFS which runs into a case where the target for a given name
2592 * may change from under it. Note this does nothing to solve the following
2593 * race: 2 callers of cache_enter_time_flags pass a different target vnode for
2594 * the same [dvp, cnp]. It may be argued that code doing this is broken.
2595 */
2596 void
2597 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2598 struct timespec *tsp, struct timespec *dtsp, int flags)
2599 {
2600
2601 MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0);
2602
2603 if (flags & VFS_CACHE_DROPOLD)
2604 cache_remove_cnp(dvp, cnp);
2605 cache_enter_time(dvp, vp, cnp, tsp, dtsp);
2606 }
2607
2608 static u_int
2609 cache_roundup_2(u_int val)
2610 {
2611 u_int res;
2612
2613 for (res = 1; res <= val; res <<= 1)
2614 continue;
2615
2616 return (res);
2617 }
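/*
 * Worked example: cache_roundup_2() returns the smallest power of 2 strictly
 * greater than its argument, e.g. cache_roundup_2(1000) == 1024 and
 * cache_roundup_2(1024) == 2048; nchinittbl() below halves that to obtain
 * the final table size.
 */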
2618
2619 static struct nchashhead *
2620 nchinittbl(u_long elements, u_long *hashmask)
2621 {
2622 struct nchashhead *hashtbl;
2623 u_long hashsize, i;
2624
2625 hashsize = cache_roundup_2(elements) / 2;
2626
2627 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2628 for (i = 0; i < hashsize; i++)
2629 CK_SLIST_INIT(&hashtbl[i]);
2630 *hashmask = hashsize - 1;
2631 return (hashtbl);
2632 }
2633
2634 static void
2635 ncfreetbl(struct nchashhead *hashtbl)
2636 {
2637
2638 free(hashtbl, M_VFSCACHE);
2639 }
2640
2641 /*
2642 * Name cache initialization, from vfs_init() when we are booting
2643 */
2644 static void
2645 nchinit(void *dummy __unused)
2646 {
2647 u_int i;
2648
2649 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2650 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2651 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2652 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2653 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2654 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2655 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2656 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2657
2658 VFS_SMR_ZONE_SET(cache_zone_small);
2659 VFS_SMR_ZONE_SET(cache_zone_small_ts);
2660 VFS_SMR_ZONE_SET(cache_zone_large);
2661 VFS_SMR_ZONE_SET(cache_zone_large_ts);
2662
2663 ncsize = desiredvnodes * ncsizefactor;
2664 cache_recalc_neg_min(ncnegminpct);
2665 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2666 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2667 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2668 ncbuckethash = 7;
2669 if (ncbuckethash > nchash)
2670 ncbuckethash = nchash;
2671 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2672 M_WAITOK | M_ZERO);
2673 for (i = 0; i < numbucketlocks; i++)
2674 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2675 ncvnodehash = ncbuckethash;
2676 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2677 M_WAITOK | M_ZERO);
2678 for (i = 0; i < numvnodelocks; i++)
2679 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2680
2681 for (i = 0; i < numneglists; i++) {
2682 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2683 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2684 TAILQ_INIT(&neglists[i].nl_list);
2685 TAILQ_INIT(&neglists[i].nl_hotlist);
2686 }
2687 }
2688 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2689
2690 void
2691 cache_vnode_init(struct vnode *vp)
2692 {
2693
2694 LIST_INIT(&vp->v_cache_src);
2695 TAILQ_INIT(&vp->v_cache_dst);
2696 vp->v_cache_dd = NULL;
2697 cache_prehash(vp);
2698 }
2699
2700 /*
2701 * Induce transient cache misses for lockless operation in cache_lookup() by
2702 * using a temporary hash table.
2703 *
2704 * This will force a fs lookup.
2705 *
2706 * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time
2707 * to observe all CPUs not performing the lookup.
2708 */
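/*
 * Illustrative sketch (exposition only): a lockless reader indexes the table
 * through NCHHASH(hash), roughly:
 *
 *	tbl = atomic-load nchashtbl; mask = atomic-load nchash;
 *	bucket = &tbl[hash & mask];
 *
 * Shrinking the mask first (and only then swapping the pointer) keeps every
 * such index within bounds of whichever table the reader happens to observe.
 */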
2709 static void
2710 cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash)
2711 {
2712
2713 MPASS(temphash < nchash);
2714 /*
2715 * Change the size. The new size is smaller and can safely be used
2716 * against the existing table. All lookups which now hash wrong will
2717 * result in a cache miss, which all callers are supposed to know how
2718 * to handle.
2719 */
2720 atomic_store_long(&nchash, temphash);
2721 atomic_thread_fence_rel();
2722 vfs_smr_synchronize();
2723 /*
2724 * At this point everyone sees the updated hash value, but they still
2725 * see the old table.
2726 */
2727 atomic_store_ptr(&nchashtbl, temptbl);
2728 atomic_thread_fence_rel();
2729 vfs_smr_synchronize();
2730 /*
2731 * At this point everyone sees the updated table pointer and size pair.
2732 */
2733 }
2734
2735 /*
2736 * Set the new hash table.
2737 *
2738 * Similarly to cache_changesize_set_temp(), this has to synchronize against
2739 * lockless operation in cache_lookup().
2740 */
2741 static void
2742 cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash)
2743 {
2744
2745 MPASS(nchash < new_hash);
2746 /*
2747 * Change the pointer first. This won't result in out-of-bounds access
2748 * since the temporary table is guaranteed to be smaller.
2749 */
2750 atomic_store_ptr(&nchashtbl, new_tbl);
2751 atomic_thread_fence_rel();
2752 vfs_smr_synchronize();
2753 /*
2754 * At this point everyone sees the updated pointer value, but they
2755 * still see the old size.
2756 */
2757 atomic_store_long(&nchash, new_hash);
2758 atomic_thread_fence_rel();
2759 vfs_smr_synchronize();
2760 /*
2761 * At this point everyone sees the updated table pointer and size pair.
2762 */
2763 }
2764
2765 void
2766 cache_changesize(u_long newmaxvnodes)
2767 {
2768 struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl;
2769 u_long new_nchash, old_nchash, temphash;
2770 struct namecache *ncp;
2771 uint32_t hash;
2772 u_long newncsize;
2773 int i;
2774
2775 newncsize = newmaxvnodes * ncsizefactor;
2776 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2777 if (newmaxvnodes < numbucketlocks)
2778 newmaxvnodes = numbucketlocks;
2779
2780 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2781 /* If same hash table size, nothing to do */
2782 if (nchash == new_nchash) {
2783 ncfreetbl(new_nchashtbl);
2784 return;
2785 }
2786
2787 temptbl = nchinittbl(1, &temphash);
2788
2789 /*
2790 * Move everything from the old hash table to the new table.
2791 * None of the namecache entries in the table can be removed
2792 * because to do so, they have to be removed from the hash table.
2793 */
2794 cache_lock_all_vnodes();
2795 cache_lock_all_buckets();
2796 old_nchashtbl = nchashtbl;
2797 old_nchash = nchash;
2798 cache_changesize_set_temp(temptbl, temphash);
2799 for (i = 0; i <= old_nchash; i++) {
2800 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2801 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2802 ncp->nc_dvp);
2803 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2804 CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash);
2805 }
2806 }
2807 ncsize = newncsize;
2808 cache_recalc_neg_min(ncnegminpct);
2809 cache_changesize_set_new(new_nchashtbl, new_nchash);
2810 cache_unlock_all_buckets();
2811 cache_unlock_all_vnodes();
2812 ncfreetbl(old_nchashtbl);
2813 ncfreetbl(temptbl);
2814 }
2815
2816 /*
2817 * Remove all entries from and to a particular vnode.
2818 */
2819 static void
2820 cache_purge_impl(struct vnode *vp)
2821 {
2822 struct cache_freebatch batch;
2823 struct namecache *ncp;
2824 struct mtx *vlp, *vlp2;
2825
2826 TAILQ_INIT(&batch);
2827 vlp = VP2VNODELOCK(vp);
2828 vlp2 = NULL;
2829 mtx_lock(vlp);
2830 retry:
2831 while (!LIST_EMPTY(&vp->v_cache_src)) {
2832 ncp = LIST_FIRST(&vp->v_cache_src);
2833 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2834 goto retry;
2835 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2836 }
2837 while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2838 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2839 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2840 goto retry;
2841 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2842 }
2843 ncp = vp->v_cache_dd;
2844 if (ncp != NULL) {
2845 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2846 ("lost dotdot link"));
2847 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2848 goto retry;
2849 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2850 }
2851 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2852 mtx_unlock(vlp);
2853 if (vlp2 != NULL)
2854 mtx_unlock(vlp2);
2855 cache_free_batch(&batch);
2856 }
2857
2858 /*
2859 * Opportunistic check to see if there is anything to do.
2860 */
2861 static bool
2862 cache_has_entries(struct vnode *vp)
2863 {
2864
2865 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2866 atomic_load_ptr(&vp->v_cache_dd) == NULL)
2867 return (false);
2868 return (true);
2869 }
2870
2871 void
2872 cache_purge(struct vnode *vp)
2873 {
2874
2875 SDT_PROBE1(vfs, namecache, purge, done, vp);
2876 if (!cache_has_entries(vp))
2877 return;
2878 cache_purge_impl(vp);
2879 }
2880
2881 /*
2882 * Only to be used by vgone.
2883 */
2884 void
2885 cache_purge_vgone(struct vnode *vp)
2886 {
2887 struct mtx *vlp;
2888
2889 VNPASS(VN_IS_DOOMED(vp), vp);
2890 if (cache_has_entries(vp)) {
2891 cache_purge_impl(vp);
2892 return;
2893 }
2894
2895 /*
2896 * Serialize against a potential thread doing cache_purge.
2897 */
2898 vlp = VP2VNODELOCK(vp);
2899 mtx_wait_unlocked(vlp);
2900 if (cache_has_entries(vp)) {
2901 cache_purge_impl(vp);
2902 return;
2903 }
2904 return;
2905 }
2906
2907 /*
2908 * Remove all negative entries for a particular directory vnode.
2909 */
2910 void
2911 cache_purge_negative(struct vnode *vp)
2912 {
2913 struct cache_freebatch batch;
2914 struct namecache *ncp, *nnp;
2915 struct mtx *vlp;
2916
2917 SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2918 if (LIST_EMPTY(&vp->v_cache_src))
2919 return;
2920 TAILQ_INIT(&batch);
2921 vlp = VP2VNODELOCK(vp);
2922 mtx_lock(vlp);
2923 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2924 if (!(ncp->nc_flag & NCF_NEGATIVE))
2925 continue;
2926 cache_zap_negative_locked_vnode_kl(ncp, vp);
2927 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2928 }
2929 mtx_unlock(vlp);
2930 cache_free_batch(&batch);
2931 }
2932
2933 /*
2934 * Entry points for modifying VOP operations.
2935 */
2936 void
2937 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2938 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2939 {
2940
2941 ASSERT_VOP_IN_SEQC(fdvp);
2942 ASSERT_VOP_IN_SEQC(fvp);
2943 ASSERT_VOP_IN_SEQC(tdvp);
2944 if (tvp != NULL)
2945 ASSERT_VOP_IN_SEQC(tvp);
2946
2947 cache_purge(fvp);
2948 if (tvp != NULL) {
2949 cache_purge(tvp);
2950 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2951 ("%s: lingering negative entry", __func__));
2952 } else {
2953 cache_remove_cnp(tdvp, tcnp);
2954 }
2955
2956 /*
2957 * TODO
2958 *
2959 * Historically renaming always purged all relevant entries, but that's
2960 * quite wasteful. In particular it turns out that in many cases
2961 * the target file is immediately accessed after rename, inducing a cache
2962 * miss.
2963 *
2964 * Recode this to reduce relocking and reuse the existing entry (if any)
2965 * instead of just removing it above and allocating a new one here.
2966 */
2967 if (cache_rename_add) {
2968 cache_enter(tdvp, fvp, tcnp);
2969 }
2970 }
2971
2972 void
2973 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
2974 {
2975
2976 ASSERT_VOP_IN_SEQC(dvp);
2977 ASSERT_VOP_IN_SEQC(vp);
2978 cache_purge(vp);
2979 }
2980
2981 #ifdef INVARIANTS
2982 /*
2983 * Validate that if an entry exists it matches.
2984 */
2985 void
2986 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2987 {
2988 struct namecache *ncp;
2989 struct mtx *blp;
2990 uint32_t hash;
2991
2992 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2993 if (CK_SLIST_EMPTY(NCHHASH(hash)))
2994 return;
2995 blp = HASH2BUCKETLOCK(hash);
2996 mtx_lock(blp);
2997 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2998 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2999 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
3000 if (ncp->nc_vp != vp)
3001 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n",
3002 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp);
3003 }
3004 }
3005 mtx_unlock(blp);
3006 }
3007 #endif
3008
3009 /*
3010 * Flush all entries referencing a particular filesystem.
3011 */
3012 void
3013 cache_purgevfs(struct mount *mp)
3014 {
3015 struct vnode *vp, *mvp;
3016 size_t visited, purged;
3017
3018 visited = purged = 0;
3019 /*
3020 * Somewhat wasteful iteration over all vnodes. Would be better to
3021 * support filtering and avoid the interlock to begin with.
3022 */
3023 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
3024 visited++;
3025 if (!cache_has_entries(vp)) {
3026 VI_UNLOCK(vp);
3027 continue;
3028 }
3029 vholdl(vp);
3030 VI_UNLOCK(vp);
3031 cache_purge(vp);
3032 purged++;
3033 vdrop(vp);
3034 }
3035
3036 SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged);
3037 }
3038
3039 /*
3040 * Perform canonical checks and cache lookup, and pass on to the filesystem
3041 * through vop_cachedlookup only if needed.
3042 */
3043
3044 int
3045 vfs_cache_lookup(struct vop_lookup_args *ap)
3046 {
3047 struct vnode *dvp;
3048 int error;
3049 struct vnode **vpp = ap->a_vpp;
3050 struct componentname *cnp = ap->a_cnp;
3051 int flags = cnp->cn_flags;
3052
3053 *vpp = NULL;
3054 dvp = ap->a_dvp;
3055
3056 if (dvp->v_type != VDIR)
3057 return (ENOTDIR);
3058
3059 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
3060 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
3061 return (EROFS);
3062
3063 error = vn_dir_check_exec(dvp, cnp);
3064 if (error != 0)
3065 return (error);
3066
3067 error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
3068 if (error == 0)
3069 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
3070 if (error == -1)
3071 return (0);
3072 return (error);
3073 }
3074
3075 /* Implementation of the getcwd syscall. */
3076 int
3077 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
3078 {
3079 char *buf, *retbuf;
3080 size_t buflen;
3081 int error;
3082
3083 buflen = uap->buflen;
3084 if (__predict_false(buflen < 2))
3085 return (EINVAL);
3086 if (buflen > MAXPATHLEN)
3087 buflen = MAXPATHLEN;
3088
3089 buf = uma_zalloc(namei_zone, M_WAITOK);
3090 error = vn_getcwd(buf, &retbuf, &buflen);
3091 if (error == 0)
3092 error = copyout(retbuf, uap->buf, buflen);
3093 uma_zfree(namei_zone, buf);
3094 return (error);
3095 }
3096
3097 int
3098 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
3099 {
3100 struct pwd *pwd;
3101 int error;
3102
3103 vfs_smr_enter();
3104 pwd = pwd_get_smr();
3105 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
3106 buflen, 0);
3107 VFS_SMR_ASSERT_NOT_ENTERED();
3108 if (error < 0) {
3109 pwd = pwd_hold(curthread);
3110 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
3111 retbuf, buflen);
3112 pwd_drop(pwd);
3113 }
3114
3115 #ifdef KTRACE
3116 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
3117 ktrnamei(*retbuf);
3118 #endif
3119 return (error);
3120 }
3121
3122 static int
3123 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
3124 size_t size, int flags, enum uio_seg pathseg)
3125 {
3126 struct nameidata nd;
3127 char *retbuf, *freebuf;
3128 int error;
3129
3130 if (flags != 0)
3131 return (EINVAL);
3132 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
3133 pathseg, path, fd, &cap_fstat_rights, td);
3134 if ((error = namei(&nd)) != 0)
3135 return (error);
3136
3137 if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR &&
3138 (nd.ni_vp->v_vflag & VV_ROOT) != 0) {
3139 /*
3140 * This happens if vp is a file mount. The call to
3141 * vn_fullpath_hardlink can panic if path resolution can't be
3142 * handled without the directory.
3143 *
3144 * To resolve this, we find the vnode which was mounted on -
3145 * this should have a unique global path since we disallow
3146 * mounting on linked files.
3147 */
3148 struct vnode *covered_vp;
3149 error = vn_lock(nd.ni_vp, LK_SHARED);
3150 if (error != 0)
3151 goto out;
3152 covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered;
3153 vref(covered_vp);
3154 VOP_UNLOCK(nd.ni_vp);
3155 error = vn_fullpath(covered_vp, &retbuf, &freebuf);
3156 vrele(covered_vp);
3157 } else {
3158 error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr,
3159 nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &size);
3160 }
3161 if (error == 0) {
3162 error = copyout(retbuf, buf, size);
3163 free(freebuf, M_TEMP);
3164 }
3165 out:
3166 NDFREE(&nd, 0);
3167 return (error);
3168 }
3169
3170 int
3171 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
3172 {
3173
3174 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
3175 uap->flags, UIO_USERSPACE));
3176 }
3177
3178 /*
3179 * Retrieve the full filesystem path that corresponds to a vnode from the name
3180 * cache (if available)
3181 */
3182 int
3183 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
3184 {
3185 struct pwd *pwd;
3186 char *buf;
3187 size_t buflen;
3188 int error;
3189
3190 if (__predict_false(vp == NULL))
3191 return (EINVAL);
3192
3193 buflen = MAXPATHLEN;
3194 buf = malloc(buflen, M_TEMP, M_WAITOK);
3195 vfs_smr_enter();
3196 pwd = pwd_get_smr();
3197 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
3198 VFS_SMR_ASSERT_NOT_ENTERED();
3199 if (error < 0) {
3200 pwd = pwd_hold(curthread);
3201 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
3202 pwd_drop(pwd);
3203 }
3204 if (error == 0)
3205 *freebuf = buf;
3206 else
3207 free(buf, M_TEMP);
3208 return (error);
3209 }
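/*
 * Illustrative usage sketch (mirrors kern___realpathat() above): the caller
 * consumes *retbuf and releases the backing storage through *freebuf:
 *
 *	error = vn_fullpath(vp, &retbuf, &freebuf);
 *	if (error == 0) {
 *		... use retbuf ...
 *		free(freebuf, M_TEMP);
 *	}
 */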
3210
3211 /*
3212 * This function is similar to vn_fullpath, but it attempts to lookup the
3213 * pathname relative to the global root mount point. This is required for the
3214 * auditing sub-system, as audited pathnames must be absolute, relative to the
3215 * global root mount point.
3216 */
3217 int
3218 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
3219 {
3220 char *buf;
3221 size_t buflen;
3222 int error;
3223
3224 if (__predict_false(vp == NULL))
3225 return (EINVAL);
3226 buflen = MAXPATHLEN;
3227 buf = malloc(buflen, M_TEMP, M_WAITOK);
3228 vfs_smr_enter();
3229 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
3230 VFS_SMR_ASSERT_NOT_ENTERED();
3231 if (error < 0) {
3232 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
3233 }
3234 if (error == 0)
3235 *freebuf = buf;
3236 else
3237 free(buf, M_TEMP);
3238 return (error);
3239 }
3240
3241 static struct namecache *
3242 vn_dd_from_dst(struct vnode *vp)
3243 {
3244 struct namecache *ncp;
3245
3246 cache_assert_vnode_locked(vp);
3247 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
3248 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3249 return (ncp);
3250 }
3251 return (NULL);
3252 }
3253
3254 int
3255 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
3256 {
3257 struct vnode *dvp;
3258 struct namecache *ncp;
3259 struct mtx *vlp;
3260 int error;
3261
3262 vlp = VP2VNODELOCK(*vp);
3263 mtx_lock(vlp);
3264 ncp = (*vp)->v_cache_dd;
3265 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
3266 KASSERT(ncp == vn_dd_from_dst(*vp),
3267 ("%s: mismatch for dd entry (%p != %p)", __func__,
3268 ncp, vn_dd_from_dst(*vp)));
3269 } else {
3270 ncp = vn_dd_from_dst(*vp);
3271 }
3272 if (ncp != NULL) {
3273 if (*buflen < ncp->nc_nlen) {
3274 mtx_unlock(vlp);
3275 vrele(*vp);
3276 counter_u64_add(numfullpathfail4, 1);
3277 error = ENOMEM;
3278 SDT_PROBE3(vfs, namecache, fullpath, return, error,
3279 vp, NULL);
3280 return (error);
3281 }
3282 *buflen -= ncp->nc_nlen;
3283 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3284 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
3285 ncp->nc_name, vp);
3286 dvp = *vp;
3287 *vp = ncp->nc_dvp;
3288 vref(*vp);
3289 mtx_unlock(vlp);
3290 vrele(dvp);
3291 return (0);
3292 }
3293 SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
3294
3295 mtx_unlock(vlp);
3296 vn_lock(*vp, LK_SHARED | LK_RETRY);
3297 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
3298 vput(*vp);
3299 if (error) {
3300 counter_u64_add(numfullpathfail2, 1);
3301 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3302 return (error);
3303 }
3304
3305 *vp = dvp;
3306 if (VN_IS_DOOMED(dvp)) {
3307 /* forced unmount */
3308 vrele(dvp);
3309 error = ENOENT;
3310 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3311 return (error);
3312 }
3313 /*
3314 * *vp has its use count incremented still.
3315 */
3316
3317 return (0);
3318 }
3319
3320 /*
3321 * Resolve a directory to a pathname.
3322 *
3323 * The name of the directory can always be found in the namecache or fetched
3324 * from the filesystem. There is also guaranteed to be only one parent, meaning
3325 * we can just follow vnodes up until we find the root.
3326 *
3327 * The vnode must be referenced.
3328 */
3329 static int
3330 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3331 size_t *len, size_t addend)
3332 {
3333 #ifdef KDTRACE_HOOKS
3334 struct vnode *startvp = vp;
3335 #endif
3336 struct vnode *vp1;
3337 size_t buflen;
3338 int error;
3339 bool slash_prefixed;
3340
3341 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3342 VNPASS(vp->v_usecount > 0, vp);
3343
3344 buflen = *len;
3345
3346 slash_prefixed = true;
3347 if (addend == 0) {
3348 MPASS(*len >= 2);
3349 buflen--;
3350 buf[buflen] = '\0';
3351 slash_prefixed = false;
3352 }
3353
3354 error = 0;
3355
3356 SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
3357 counter_u64_add(numfullpathcalls, 1);
3358 while (vp != rdir && vp != rootvnode) {
3359 /*
3360 * The vp vnode must be already fully constructed,
3361 * since it is either found in namecache or obtained
3362 * from VOP_VPTOCNP(). We may test for VV_ROOT safely
3363 * without obtaining the vnode lock.
3364 */
3365 if ((vp->v_vflag & VV_ROOT) != 0) {
3366 vn_lock(vp, LK_RETRY | LK_SHARED);
3367
3368 /*
3369 * With the vnode locked, check for races with
3370 * unmount, forced or not. Note that we
3371 * already verified that vp is not equal to
3372 * the root vnode, which means that
3373 * mnt_vnodecovered can be NULL only for the
3374 * case of unmount.
3375 */
3376 if (VN_IS_DOOMED(vp) ||
3377 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
3378 vp1->v_mountedhere != vp->v_mount) {
3379 vput(vp);
3380 error = ENOENT;
3381 SDT_PROBE3(vfs, namecache, fullpath, return,
3382 error, vp, NULL);
3383 break;
3384 }
3385
3386 vref(vp1);
3387 vput(vp);
3388 vp = vp1;
3389 continue;
3390 }
3391 if (vp->v_type != VDIR) {
3392 vrele(vp);
3393 counter_u64_add(numfullpathfail1, 1);
3394 error = ENOTDIR;
3395 SDT_PROBE3(vfs, namecache, fullpath, return,
3396 error, vp, NULL);
3397 break;
3398 }
3399 error = vn_vptocnp(&vp, buf, &buflen);
3400 if (error)
3401 break;
3402 if (buflen == 0) {
3403 vrele(vp);
3404 error = ENOMEM;
3405 SDT_PROBE3(vfs, namecache, fullpath, return, error,
3406 startvp, NULL);
3407 break;
3408 }
3409 buf[--buflen] = '/';
3410 slash_prefixed = true;
3411 }
3412 if (error)
3413 return (error);
3414 if (!slash_prefixed) {
3415 if (buflen == 0) {
3416 vrele(vp);
3417 counter_u64_add(numfullpathfail4, 1);
3418 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3419 startvp, NULL);
3420 return (ENOMEM);
3421 }
3422 buf[--buflen] = '/';
3423 }
3424 counter_u64_add(numfullpathfound, 1);
3425 vrele(vp);
3426
3427 *retbuf = buf + buflen;
3428 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3429 *len -= buflen;
3430 *len += addend;
3431 return (0);
3432 }
3433
3434 /*
3435 * Resolve an arbitrary vnode to a pathname.
3436 *
3437 * Note 2 caveats:
3438 * - hardlinks are not tracked, thus if the vnode is not a directory this can
3439 * resolve to a different path than the one used to find it
3440 * - namecache is not mandatory, meaning names are not guaranteed to be added
3441 * (in which case resolving fails)
3442 */
3443 static void __inline
3444 cache_rev_failed_impl(int *reason, int line)
3445 {
3446
3447 *reason = line;
3448 }
3449 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__)
3450
3451 static int
3452 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3453 char **retbuf, size_t *buflen, size_t addend)
3454 {
3455 #ifdef KDTRACE_HOOKS
3456 struct vnode *startvp = vp;
3457 #endif
3458 struct vnode *tvp;
3459 struct mount *mp;
3460 struct namecache *ncp;
3461 size_t orig_buflen;
3462 int reason;
3463 int error;
3464 #ifdef KDTRACE_HOOKS
3465 int i;
3466 #endif
3467 seqc_t vp_seqc, tvp_seqc;
3468 u_char nc_flag;
3469
3470 VFS_SMR_ASSERT_ENTERED();
3471
3472 if (!atomic_load_char(&cache_fast_lookup_enabled)) {
3473 vfs_smr_exit();
3474 return (-1);
3475 }
3476
3477 orig_buflen = *buflen;
3478
3479 if (addend == 0) {
3480 MPASS(*buflen >= 2);
3481 *buflen -= 1;
3482 buf[*buflen] = '\0';
3483 }
3484
3485 if (vp == rdir || vp == rootvnode) {
3486 if (addend == 0) {
3487 *buflen -= 1;
3488 buf[*buflen] = '/';
3489 }
3490 goto out_ok;
3491 }
3492
3493 #ifdef KDTRACE_HOOKS
3494 i = 0;
3495 #endif
3496 error = -1;
3497 ncp = NULL; /* for sdt probe down below */
3498 vp_seqc = vn_seqc_read_any(vp);
3499 if (seqc_in_modify(vp_seqc)) {
3500 cache_rev_failed(&reason);
3501 goto out_abort;
3502 }
3503
3504 for (;;) {
3505 #ifdef KDTRACE_HOOKS
3506 i++;
3507 #endif
3508 if ((vp->v_vflag & VV_ROOT) != 0) {
3509 mp = atomic_load_ptr(&vp->v_mount);
3510 if (mp == NULL) {
3511 cache_rev_failed(&reason);
3512 goto out_abort;
3513 }
3514 tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3515 tvp_seqc = vn_seqc_read_any(tvp);
3516 if (seqc_in_modify(tvp_seqc)) {
3517 cache_rev_failed(&reason);
3518 goto out_abort;
3519 }
3520 if (!vn_seqc_consistent(vp, vp_seqc)) {
3521 cache_rev_failed(&reason);
3522 goto out_abort;
3523 }
3524 vp = tvp;
3525 vp_seqc = tvp_seqc;
3526 continue;
3527 }
3528 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
3529 if (ncp == NULL) {
3530 cache_rev_failed(&reason);
3531 goto out_abort;
3532 }
3533 nc_flag = atomic_load_char(&ncp->nc_flag);
3534 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3535 cache_rev_failed(&reason);
3536 goto out_abort;
3537 }
3538 if (ncp->nc_nlen >= *buflen) {
3539 cache_rev_failed(&reason);
3540 error = ENOMEM;
3541 goto out_abort;
3542 }
3543 *buflen -= ncp->nc_nlen;
3544 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3545 *buflen -= 1;
3546 buf[*buflen] = '/';
3547 tvp = ncp->nc_dvp;
3548 tvp_seqc = vn_seqc_read_any(tvp);
3549 if (seqc_in_modify(tvp_seqc)) {
3550 cache_rev_failed(&reason);
3551 goto out_abort;
3552 }
3553 if (!vn_seqc_consistent(vp, vp_seqc)) {
3554 cache_rev_failed(&reason);
3555 goto out_abort;
3556 }
3557 /*
3558 * Acquire fence provided by vn_seqc_read_any above.
3559 */
3560 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
3561 cache_rev_failed(&reason);
3562 goto out_abort;
3563 }
3564 if (!cache_ncp_canuse(ncp)) {
3565 cache_rev_failed(&reason);
3566 goto out_abort;
3567 }
3568 vp = tvp;
3569 vp_seqc = tvp_seqc;
3570 if (vp == rdir || vp == rootvnode)
3571 break;
3572 }
3573 out_ok:
3574 vfs_smr_exit();
3575 *retbuf = buf + *buflen;
3576 *buflen = orig_buflen - *buflen + addend;
3577 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3578 return (0);
3579
3580 out_abort:
3581 *buflen = orig_buflen;
3582 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3583 vfs_smr_exit();
3584 return (error);
3585 }
3586
3587 static int
3588 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3589 size_t *buflen)
3590 {
3591 size_t orig_buflen, addend;
3592 int error;
3593
3594 if (*buflen < 2)
3595 return (EINVAL);
3596
3597 orig_buflen = *buflen;
3598
3599 vref(vp);
3600 addend = 0;
3601 if (vp->v_type != VDIR) {
3602 *buflen -= 1;
3603 buf[*buflen] = '\0';
3604 error = vn_vptocnp(&vp, buf, buflen);
3605 if (error)
3606 return (error);
3607 if (*buflen == 0) {
3608 vrele(vp);
3609 return (ENOMEM);
3610 }
3611 *buflen -= 1;
3612 buf[*buflen] = '/';
3613 addend = orig_buflen - *buflen;
3614 }
3615
3616 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3617 }
3618
3619 /*
3620 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3621 *
3622 * Since the namecache does not track hardlinks, the caller is
3623 * expected to first look up the target vnode with SAVENAME |
3624 * WANTPARENT flags passed to namei to get dvp and vp.
3625 *
3626 * Then we have 2 cases:
3627 * - if the found vnode is a directory, the path can be constructed just by
3628 * following names up the chain
3629 * - otherwise we populate the buffer with the saved name and start resolving
3630 * from the parent
3631 */
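/*
 * Illustrative sketch (mirrors kern___realpathat() above): with the
 * nameidata obtained from such a lookup, the call looks like:
 *
 *	error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp,
 *	    nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen,
 *	    &retbuf, &freebuf, &buflen);
 */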
3632 int
3633 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp,
3634 const char *hrdl_name, size_t hrdl_name_length,
3635 char **retbuf, char **freebuf, size_t *buflen)
3636 {
3637 char *buf, *tmpbuf;
3638 struct pwd *pwd;
3639 size_t addend;
3640 int error;
3641 enum vtype type;
3642
3643 if (*buflen < 2)
3644 return (EINVAL);
3645 if (*buflen > MAXPATHLEN)
3646 *buflen = MAXPATHLEN;
3647
3648 buf = malloc(*buflen, M_TEMP, M_WAITOK);
3649
3650 addend = 0;
3651
3652 /*
3653 * Check for VBAD to work around the vp_crossmp bug in lookup().
3654 *
3655 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3656 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3657 * If the type is VDIR (like in this very case) we can skip looking
3658 * at ni_dvp in the first place. However, since vnodes get passed here
3659 * unlocked, the target may transition to the doomed state (type == VBAD)
3660 * before we get to evaluate the condition. If this happens, we will
3661 * populate part of the buffer and descend to vn_fullpath_dir with
3662 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3663 *
3664 * This should be atomic_load(&vp->v_type) but it is illegal to take
3665 * an address of a bit field, even if said field is sized to char.
3666 * Work around the problem by reading the value into a full-sized enum
3667 * and then re-reading it with atomic_load which will still prevent
3668 * the compiler from re-reading down the road.
3669 */
3670 type = vp->v_type;
3671 type = atomic_load_int(&type);
3672 if (type == VBAD) {
3673 error = ENOENT;
3674 goto out_bad;
3675 }
3676 if (type != VDIR) {
3677 addend = hrdl_name_length + 2;
3678 if (*buflen < addend) {
3679 error = ENOMEM;
3680 goto out_bad;
3681 }
3682 *buflen -= addend;
3683 tmpbuf = buf + *buflen;
3684 tmpbuf[0] = '/';
3685 memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length);
3686 tmpbuf[addend - 1] = '\0';
3687 vp = dvp;
3688 }
3689
3690 vfs_smr_enter();
3691 pwd = pwd_get_smr();
3692 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3693 addend);
3694 VFS_SMR_ASSERT_NOT_ENTERED();
3695 if (error < 0) {
3696 pwd = pwd_hold(curthread);
3697 vref(vp);
3698 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3699 addend);
3700 pwd_drop(pwd);
3701 }
3702 if (error != 0)
3703 goto out_bad;
3704
3705 *freebuf = buf;
3706
3707 return (0);
3708 out_bad:
3709 free(buf, M_TEMP);
3710 return (error);
3711 }
3712
3713 struct vnode *
3714 vn_dir_dd_ino(struct vnode *vp)
3715 {
3716 struct namecache *ncp;
3717 struct vnode *ddvp;
3718 struct mtx *vlp;
3719 enum vgetstate vs;
3720
3721 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3722 vlp = VP2VNODELOCK(vp);
3723 mtx_lock(vlp);
3724 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3725 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3726 continue;
3727 ddvp = ncp->nc_dvp;
3728 vs = vget_prep(ddvp);
3729 mtx_unlock(vlp);
3730 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3731 return (NULL);
3732 return (ddvp);
3733 }
3734 mtx_unlock(vlp);
3735 return (NULL);
3736 }
3737
3738 int
3739 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3740 {
3741 struct namecache *ncp;
3742 struct mtx *vlp;
3743 int l;
3744
3745 vlp = VP2VNODELOCK(vp);
3746 mtx_lock(vlp);
3747 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3748 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3749 break;
3750 if (ncp == NULL) {
3751 mtx_unlock(vlp);
3752 return (ENOENT);
3753 }
3754 l = min(ncp->nc_nlen, buflen - 1);
3755 memcpy(buf, ncp->nc_name, l);
3756 mtx_unlock(vlp);
3757 buf[l] = '\0';
3758 return (0);
3759 }
3760
3761 /*
3762 * This function updates path string to vnode's full global path
3763 * and checks the size of the new path string against the pathlen argument.
3764 *
3765 * Requires a locked, referenced vnode.
3766 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3767 *
3768 * If vp is a directory, the call to vn_fullpath_global() always succeeds
3769 * because it falls back to the ".." lookup if the namecache lookup fails.
3770 */
3771 int
3772 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3773 u_int pathlen)
3774 {
3775 struct nameidata nd;
3776 struct vnode *vp1;
3777 char *rpath, *fbuf;
3778 int error;
3779
3780 ASSERT_VOP_ELOCKED(vp, __func__);
3781
3782 /* Construct global filesystem path from vp. */
3783 VOP_UNLOCK(vp);
3784 error = vn_fullpath_global(vp, &rpath, &fbuf);
3785
3786 if (error != 0) {
3787 vrele(vp);
3788 return (error);
3789 }
3790
3791 if (strlen(rpath) >= pathlen) {
3792 vrele(vp);
3793 error = ENAMETOOLONG;
3794 goto out;
3795 }
3796
3797 /*
3798 * Re-lookup the vnode by path to detect a possible rename.
3799 * As a side effect, the vnode is relocked.
3800 * If vnode was renamed, return ENOENT.
3801 */
3802 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3803 UIO_SYSSPACE, path, td);
3804 error = namei(&nd);
3805 if (error != 0) {
3806 vrele(vp);
3807 goto out;
3808 }
3809 NDFREE(&nd, NDF_ONLY_PNBUF);
3810 vp1 = nd.ni_vp;
3811 vrele(vp);
3812 if (vp1 == vp)
3813 strcpy(path, rpath);
3814 else {
3815 vput(vp1);
3816 error = ENOENT;
3817 }
3818
3819 out:
3820 free(fbuf, M_TEMP);
3821 return (error);
3822 }
3823
3824 /*
3825 * This is similar to vn_path_to_global_path but allows for regular
3826 * files which may not be present in the cache.
3827 *
3828 * Requires a locked, referenced vnode.
3829 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3830 */
3831 int
3832 vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp,
3833 struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name,
3834 size_t leaf_length)
3835 {
3836 struct nameidata nd;
3837 struct vnode *vp1;
3838 char *rpath, *fbuf;
3839 size_t len;
3840 int error;
3841
3842 ASSERT_VOP_ELOCKED(vp, __func__);
3843
3844 /*
3845 * Construct global filesystem path from dvp, vp and leaf
3846 * name.
3847 */
3848 VOP_UNLOCK(vp);
3849 error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length,
3850 &rpath, &fbuf, &len);
3851
3852 if (error != 0) {
3853 vrele(vp);
3854 goto out;
3855 }
3856
3857 if (strlen(rpath) >= pathlen) {
3858 vrele(vp);
3859 error = ENAMETOOLONG;
3860 goto out;
3861 }
3862
3863 /*
3864 * Re-lookup the vnode by path to detect a possible rename.
3865 * As a side effect, the vnode is relocked.
3866 * If vnode was renamed, return ENOENT.
3867 */
3868 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path, td);
3869 error = namei(&nd);
3870 if (error != 0) {
3871 vrele(vp);
3872 goto out;
3873 }
3874 NDFREE_PNBUF(&nd);
3875 vp1 = nd.ni_vp;
3876 vrele(vp);
3877 if (vp1 == vp)
3878 strcpy(path, rpath);
3879 else {
3880 vput(vp1);
3881 error = ENOENT;
3882 }
3883
3884 out:
3885 free(fbuf, M_TEMP);
3886 return (error);
3887 }
3888
3889 #ifdef DDB
3890 static void
3891 db_print_vpath(struct vnode *vp)
3892 {
3893
3894 while (vp != NULL) {
3895 db_printf("%p: ", vp);
3896 if (vp == rootvnode) {
3897 db_printf("/");
3898 vp = NULL;
3899 } else {
3900 if (vp->v_vflag & VV_ROOT) {
3901 db_printf("<mount point>");
3902 vp = vp->v_mount->mnt_vnodecovered;
3903 } else {
3904 struct namecache *ncp;
3905 char *ncn;
3906 int i;
3907
3908 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3909 if (ncp != NULL) {
3910 ncn = ncp->nc_name;
3911 for (i = 0; i < ncp->nc_nlen; i++)
3912 db_printf("%c", *ncn++);
3913 vp = ncp->nc_dvp;
3914 } else {
3915 vp = NULL;
3916 }
3917 }
3918 }
3919 db_printf("\n");
3920 }
3921
3922 return;
3923 }
3924
3925 DB_SHOW_COMMAND(vpath, db_show_vpath)
3926 {
3927 struct vnode *vp;
3928
3929 if (!have_addr) {
3930 db_printf("usage: show vpath <struct vnode *>\n");
3931 return;
3932 }
3933
3934 vp = (struct vnode *)addr;
3935 db_print_vpath(vp);
3936 }
3937
3938 #endif
3939
3940 static int cache_fast_lookup = 1;
3941
3942 #define CACHE_FPL_FAILED -2020
3943
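/*
 * Recompute whether the lockless lookup may be used at all.  It stays
 * disabled while the relevant MAC hooks are registered, presumably because
 * the fast path does not invoke them, and tracks the cache_fast_lookup knob
 * otherwise.
 */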
3944 void
3945 cache_fast_lookup_enabled_recalc(void)
3946 {
3947 int lookup_flag;
3948 int mac_on;
3949
3950 #ifdef MAC
3951 mac_on = mac_vnode_check_lookup_enabled();
3952 mac_on |= mac_vnode_check_readlink_enabled();
3953 #else
3954 mac_on = 0;
3955 #endif
3956
3957 lookup_flag = atomic_load_int(&cache_fast_lookup);
3958 if (lookup_flag && !mac_on) {
3959 atomic_store_char(&cache_fast_lookup_enabled, true);
3960 } else {
3961 atomic_store_char(&cache_fast_lookup_enabled, false);
3962 }
3963 }
3964
3965 static int
3966 sysctl_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
3967 {
3968 int error, old;
3969
3970 old = atomic_load_int(&cache_fast_lookup);
3971 error = sysctl_handle_int(oidp, arg1, arg2, req);
3972 if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
3973 cache_fast_lookup_enabled_recalc();
3974 return (error);
3975 }
3976 SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
3977 &cache_fast_lookup, 0, sysctl_vfs_cache_fast_lookup, "IU", "");
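/*
 * The knob is exposed as vfs.cache_fast_lookup and can be flipped at
 * runtime, e.g. (illustrative):
 *	# sysctl vfs.cache_fast_lookup=0
 * which makes cache_fast_lookup_enabled_recalc() turn the fast path off.
 */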
3978
3979 /*
3980 * Components of nameidata (or objects it can point to) which may
3981 * need restoring in case fast path lookup fails.
3982 */
3983 struct nameidata_outer {
3984 size_t ni_pathlen;
3985 int cn_flags;
3986 };
3987
3988 struct nameidata_saved {
3989 #ifdef INVARIANTS
3990 char *cn_nameptr;
3991 size_t ni_pathlen;
3992 #endif
3993 };
3994
3995 #ifdef INVARIANTS
3996 struct cache_fpl_debug {
3997 size_t ni_pathlen;
3998 };
3999 #endif
4000
4001 struct cache_fpl {
4002 struct nameidata *ndp;
4003 struct componentname *cnp;
4004 char *nulchar;
4005 struct vnode *dvp;
4006 struct vnode *tvp;
4007 seqc_t dvp_seqc;
4008 seqc_t tvp_seqc;
4009 uint32_t hash;
4010 struct nameidata_saved snd;
4011 struct nameidata_outer snd_outer;
4012 int line;
4013 enum cache_fpl_status status:8;
4014 bool in_smr;
4015 bool fsearch;
4016 bool savename;
4017 struct pwd **pwd;
4018 #ifdef INVARIANTS
4019 struct cache_fpl_debug debug;
4020 #endif
4021 };
4022
4023 static bool cache_fplookup_mp_supported(struct mount *mp);
4024 static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
4025 static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
4026 static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
4027 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
4028 static int cache_fplookup_trailingslash(struct cache_fpl *fpl);
4029 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
4030 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
4031 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
4032 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
4033
4034 static void
4035 cache_fpl_cleanup_cnp(struct componentname *cnp)
4036 {
4037
4038 uma_zfree(namei_zone, cnp->cn_pnbuf);
4039 #ifdef DIAGNOSTIC
4040 cnp->cn_pnbuf = NULL;
4041 cnp->cn_nameptr = NULL;
4042 #endif
4043 }
4044
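/*
 * The remaining path starts with a slash: restart the walk from the root
 * directory, consuming the leading slash and any spurious ones after it.
 */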
4045 static struct vnode *
4046 cache_fpl_handle_root(struct cache_fpl *fpl)
4047 {
4048 struct nameidata *ndp;
4049 struct componentname *cnp;
4050
4051 ndp = fpl->ndp;
4052 cnp = fpl->cnp;
4053
4054 MPASS(*(cnp->cn_nameptr) == '/');
4055 cnp->cn_nameptr++;
4056 cache_fpl_pathlen_dec(fpl);
4057
4058 if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4059 do {
4060 cnp->cn_nameptr++;
4061 cache_fpl_pathlen_dec(fpl);
4062 } while (*(cnp->cn_nameptr) == '/');
4063 }
4064
4065 return (ndp->ni_rootdir);
4066 }
4067
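/*
 * Helpers to save and restore nameidata state so that the locked lookup can
 * take over with a consistent view if the lockless one bails out.
 */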
4068 static void
4069 cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
4070 {
4071
4072 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
4073 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
4074 }
4075
4076 static void
4077 cache_fpl_checkpoint(struct cache_fpl *fpl)
4078 {
4079
4080 #ifdef INVARIANTS
4081 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
4082 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen;
4083 #endif
4084 }
4085
4086 static void
4087 cache_fpl_restore_partial(struct cache_fpl *fpl)
4088 {
4089
4090 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
4091 #ifdef INVARIANTS
4092 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
4093 #endif
4094 }
4095
4096 static void
4097 cache_fpl_restore_abort(struct cache_fpl *fpl)
4098 {
4099
4100 cache_fpl_restore_partial(fpl);
4101 /*
4102 * It is 0 on entry by API contract.
4103 */
4104 fpl->ndp->ni_resflags = 0;
4105 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
4106 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
4107 }
4108
4109 #ifdef INVARIANTS
4110 #define cache_fpl_smr_assert_entered(fpl) ({ \
4111 struct cache_fpl *_fpl = (fpl); \
4112 MPASS(_fpl->in_smr == true); \
4113 VFS_SMR_ASSERT_ENTERED(); \
4114 })
4115 #define cache_fpl_smr_assert_not_entered(fpl) ({ \
4116 struct cache_fpl *_fpl = (fpl); \
4117 MPASS(_fpl->in_smr == false); \
4118 VFS_SMR_ASSERT_NOT_ENTERED(); \
4119 })
4120 static void
4121 cache_fpl_assert_status(struct cache_fpl *fpl)
4122 {
4123
4124 switch (fpl->status) {
4125 case CACHE_FPL_STATUS_UNSET:
4126 __assert_unreachable();
4127 break;
4128 case CACHE_FPL_STATUS_DESTROYED:
4129 case CACHE_FPL_STATUS_ABORTED:
4130 case CACHE_FPL_STATUS_PARTIAL:
4131 case CACHE_FPL_STATUS_HANDLED:
4132 break;
4133 }
4134 }
4135 #else
4136 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
4137 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
4138 #define cache_fpl_assert_status(fpl) do { } while (0)
4139 #endif
4140
4141 #define cache_fpl_smr_enter_initial(fpl) ({ \
4142 struct cache_fpl *_fpl = (fpl); \
4143 vfs_smr_enter(); \
4144 _fpl->in_smr = true; \
4145 })
4146
4147 #define cache_fpl_smr_enter(fpl) ({ \
4148 struct cache_fpl *_fpl = (fpl); \
4149 MPASS(_fpl->in_smr == false); \
4150 vfs_smr_enter(); \
4151 _fpl->in_smr = true; \
4152 })
4153
4154 #define cache_fpl_smr_exit(fpl) ({ \
4155 struct cache_fpl *_fpl = (fpl); \
4156 MPASS(_fpl->in_smr == true); \
4157 vfs_smr_exit(); \
4158 _fpl->in_smr = false; \
4159 })
4160
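/*
 * Termination helpers below move the lookup from UNSET to exactly one of:
 * ABORTED (fall back and redo the locked lookup from scratch), PARTIAL
 * (resume the locked lookup from the current component), HANDLED (done,
 * with 0 or an errno) or DESTROYED (the path buffer was overwritten while
 * resolving a symlink, so falling back is no longer possible).
 */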
4161 static int
4162 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
4163 {
4164
4165 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4166 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4167 ("%s: converting to abort from %d at %d, set at %d\n",
4168 __func__, fpl->status, line, fpl->line));
4169 }
4170 cache_fpl_smr_assert_not_entered(fpl);
4171 fpl->status = CACHE_FPL_STATUS_ABORTED;
4172 fpl->line = line;
4173 return (CACHE_FPL_FAILED);
4174 }
4175
4176 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__)
4177
4178 static int __noinline
4179 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
4180 {
4181 struct nameidata *ndp;
4182 struct componentname *cnp;
4183
4184 ndp = fpl->ndp;
4185 cnp = fpl->cnp;
4186
4187 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4188 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4189 ("%s: converting to abort from %d at %d, set at %d\n",
4190 __func__, fpl->status, line, fpl->line));
4191 }
4192 fpl->status = CACHE_FPL_STATUS_ABORTED;
4193 fpl->line = line;
4194 if (fpl->in_smr)
4195 cache_fpl_smr_exit(fpl);
4196 cache_fpl_restore_abort(fpl);
4197 /*
4198 * Resolving symlinks overwrites data passed by the caller.
4199 * Let namei know.
4200 */
4201 if (ndp->ni_loopcnt > 0) {
4202 fpl->status = CACHE_FPL_STATUS_DESTROYED;
4203 cache_fpl_cleanup_cnp(cnp);
4204 }
4205 return (CACHE_FPL_FAILED);
4206 }
4207
4208 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__)
4209
4210 static int __noinline
4211 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
4212 {
4213
4214 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4215 ("%s: setting to partial at %d, but already set to %d at %d\n",
4216 __func__, line, fpl->status, fpl->line));
4217 cache_fpl_smr_assert_entered(fpl);
4218 fpl->status = CACHE_FPL_STATUS_PARTIAL;
4219 fpl->line = line;
4220 return (cache_fplookup_partial_setup(fpl));
4221 }
4222
4223 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__)
4224
4225 static int
4226 cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
4227 {
4228
4229 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4230 ("%s: setting to handled at %d, but already set to %d at %d\n",
4231 __func__, line, fpl->status, fpl->line));
4232 cache_fpl_smr_assert_not_entered(fpl);
4233 fpl->status = CACHE_FPL_STATUS_HANDLED;
4234 fpl->line = line;
4235 return (0);
4236 }
4237
4238 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__)
4239
4240 static int
4241 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
4242 {
4243
4244 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4245 ("%s: setting to handled at %d, but already set to %d at %d\n",
4246 __func__, line, fpl->status, fpl->line));
4247 MPASS(error != 0);
4248 MPASS(error != CACHE_FPL_FAILED);
4249 cache_fpl_smr_assert_not_entered(fpl);
4250 fpl->status = CACHE_FPL_STATUS_HANDLED;
4251 fpl->line = line;
4252 fpl->dvp = NULL;
4253 fpl->tvp = NULL;
4254 fpl->savename = false;
4255 return (error);
4256 }
4257
4258 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__)
4259
4260 static bool
4261 cache_fpl_terminated(struct cache_fpl *fpl)
4262 {
4263
4264 return (fpl->status != CACHE_FPL_STATUS_UNSET);
4265 }
4266
4267 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
4268 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
4269 FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | SAVENAME | SAVESTART | \
4270 WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | \
4271 WANTIOCTLCAPS)
4272
4273 #define CACHE_FPL_INTERNAL_CN_FLAGS \
4274 (ISDOTDOT | MAKEENTRY | ISLASTCN)
4275
4276 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
4277 "supported and internal flags overlap");
4278
4279 static bool
4280 cache_fpl_islastcn(struct nameidata *ndp)
4281 {
4282
4283 return (*ndp->ni_next == 0);
4284 }
4285
4286 static bool
4287 cache_fpl_istrailingslash(struct cache_fpl *fpl)
4288 {
4289
4290 MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf);
4291 return (*(fpl->nulchar - 1) == '/');
4292 }
4293
4294 static bool
4295 cache_fpl_isdotdot(struct componentname *cnp)
4296 {
4297
4298 if (cnp->cn_namelen == 2 &&
4299 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
4300 return (true);
4301 return (false);
4302 }
4303
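/*
 * Decide whether this lookup is eligible for the lockless fast path at all;
 * capability mode, auditing, unsupported flags and a preset starting
 * directory all force an early abort to the locked lookup.
 */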
4304 static bool
4305 cache_can_fplookup(struct cache_fpl *fpl)
4306 {
4307 struct nameidata *ndp;
4308 struct componentname *cnp;
4309 struct thread *td;
4310
4311 ndp = fpl->ndp;
4312 cnp = fpl->cnp;
4313 td = cnp->cn_thread;
4314
4315 if (!atomic_load_char(&cache_fast_lookup_enabled)) {
4316 cache_fpl_aborted_early(fpl);
4317 return (false);
4318 }
4319 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
4320 cache_fpl_aborted_early(fpl);
4321 return (false);
4322 }
4323 if (IN_CAPABILITY_MODE(td)) {
4324 cache_fpl_aborted_early(fpl);
4325 return (false);
4326 }
4327 if (AUDITING_TD(td)) {
4328 cache_fpl_aborted_early(fpl);
4329 return (false);
4330 }
4331 if (ndp->ni_startdir != NULL) {
4332 cache_fpl_aborted_early(fpl);
4333 return (false);
4334 }
4335 return (true);
4336 }
4337
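/*
 * Resolve the starting directory vnode from ndp->ni_dirfd for *at(2)-style
 * lookups.
 */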
4338 static int __noinline
4339 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
4340 {
4341 struct nameidata *ndp;
4342 struct componentname *cnp;
4343 int error;
4344 bool fsearch;
4345
4346 ndp = fpl->ndp;
4347 cnp = fpl->cnp;
4348
4349 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
4350 if (__predict_false(error != 0)) {
4351 return (cache_fpl_aborted(fpl));
4352 }
4353 fpl->fsearch = fsearch;
4354 if ((*vpp)->v_type != VDIR) {
4355 if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) {
4356 cache_fpl_smr_exit(fpl);
4357 return (cache_fpl_handled_error(fpl, ENOTDIR));
4358 }
4359 }
4360 return (0);
4361 }
4362
4363 static int __noinline
4364 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
4365 uint32_t hash)
4366 {
4367 struct componentname *cnp;
4368 struct vnode *dvp;
4369
4370 cnp = fpl->cnp;
4371 dvp = fpl->dvp;
4372
4373 cache_fpl_smr_exit(fpl);
4374 if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
4375 return (cache_fpl_handled_error(fpl, ENOENT));
4376 else
4377 return (cache_fpl_aborted(fpl));
4378 }
4379
4380 /*
4381 * The target vnode is not supported, prepare for the slow path to take over.
4382 */
4383 static int __noinline
4384 cache_fplookup_partial_setup(struct cache_fpl *fpl)
4385 {
4386 struct nameidata *ndp;
4387 struct componentname *cnp;
4388 enum vgetstate dvs;
4389 struct vnode *dvp;
4390 struct pwd *pwd;
4391 seqc_t dvp_seqc;
4392
4393 ndp = fpl->ndp;
4394 cnp = fpl->cnp;
4395 pwd = *(fpl->pwd);
4396 dvp = fpl->dvp;
4397 dvp_seqc = fpl->dvp_seqc;
4398
4399 if (!pwd_hold_smr(pwd)) {
4400 return (cache_fpl_aborted(fpl));
4401 }
4402
4403 /*
4404 * Note that seqc is checked before the vnode is locked, so by
4405 * the time regular lookup gets to it, it may have moved.
4406 *
4407 * Ultimately this does not affect correctness, any lookup errors
4408 * are userspace racing with itself. It is guaranteed that any
4409 * path which ultimately gets found could also have been found
4410 * by regular lookup going all the way in absence of concurrent
4411 * modifications.
4412 */
4413 dvs = vget_prep_smr(dvp);
4414 cache_fpl_smr_exit(fpl);
4415 if (__predict_false(dvs == VGET_NONE)) {
4416 pwd_drop(pwd);
4417 return (cache_fpl_aborted(fpl));
4418 }
4419
4420 vget_finish_ref(dvp, dvs);
4421 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4422 vrele(dvp);
4423 pwd_drop(pwd);
4424 return (cache_fpl_aborted(fpl));
4425 }
4426
4427 cache_fpl_restore_partial(fpl);
4428 #ifdef INVARIANTS
4429 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
4430 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
4431 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
4432 }
4433 #endif
4434
4435 ndp->ni_startdir = dvp;
4436 cnp->cn_flags |= MAKEENTRY;
4437 if (cache_fpl_islastcn(ndp))
4438 cnp->cn_flags |= ISLASTCN;
4439 if (cache_fpl_isdotdot(cnp))
4440 cnp->cn_flags |= ISDOTDOT;
4441
4442 /*
4443 * Skip potential extra slashes parsing did not take care of.
4444 * cache_fplookup_skip_slashes explains the mechanism.
4445 */
4446 if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4447 do {
4448 cnp->cn_nameptr++;
4449 cache_fpl_pathlen_dec(fpl);
4450 } while (*(cnp->cn_nameptr) == '/');
4451 }
4452
4453 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
4454 #ifdef INVARIANTS
4455 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
4456 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
4457 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
4458 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
4459 }
4460 #endif
4461 return (0);
4462 }
4463
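/*
 * Finish acquiring the already found terminal vnode: reference it, lock it
 * if LOCKLEAF was requested and re-validate its sequence counter.
 */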
4464 static int
4465 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
4466 {
4467 struct componentname *cnp;
4468 struct vnode *tvp;
4469 seqc_t tvp_seqc;
4470 int error, lkflags;
4471
4472 cnp = fpl->cnp;
4473 tvp = fpl->tvp;
4474 tvp_seqc = fpl->tvp_seqc;
4475
4476 if ((cnp->cn_flags & LOCKLEAF) != 0) {
4477 lkflags = LK_SHARED;
4478 if ((cnp->cn_flags & LOCKSHARED) == 0)
4479 lkflags = LK_EXCLUSIVE;
4480 error = vget_finish(tvp, lkflags, tvs);
4481 if (__predict_false(error != 0)) {
4482 return (cache_fpl_aborted(fpl));
4483 }
4484 } else {
4485 vget_finish_ref(tvp, tvs);
4486 }
4487
4488 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
4489 if ((cnp->cn_flags & LOCKLEAF) != 0)
4490 vput(tvp);
4491 else
4492 vrele(tvp);
4493 return (cache_fpl_aborted(fpl));
4494 }
4495
4496 return (cache_fpl_handled(fpl));
4497 }
4498
4499 /*
4500 * The caller may want to modify the state of the namecache.
4501 */
4502 static int __noinline
4503 cache_fplookup_final_modifying(struct cache_fpl *fpl)
4504 {
4505 struct nameidata *ndp;
4506 struct componentname *cnp;
4507 enum vgetstate dvs;
4508 struct vnode *dvp, *tvp;
4509 struct mount *mp;
4510 seqc_t dvp_seqc;
4511 int error;
4512 bool docache;
4513
4514 ndp = fpl->ndp;
4515 cnp = fpl->cnp;
4516 dvp = fpl->dvp;
4517 dvp_seqc = fpl->dvp_seqc;
4518
4519 MPASS(*(cnp->cn_nameptr) != '/');
4520 MPASS(cache_fpl_islastcn(ndp));
4521 if ((cnp->cn_flags & LOCKPARENT) == 0)
4522 MPASS((cnp->cn_flags & WANTPARENT) != 0);
4523 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
4524 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
4525 cnp->cn_nameiop == RENAME);
4526 MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4527 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4528
4529 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4530 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
4531 docache = false;
4532
4533 /*
4534 * Regular lookup nullifies the slash, which we don't do here.
4535 * Don't take chances with filesystem routines seeing it for
4536 * the last entry.
4537 */
4538 if (cache_fpl_istrailingslash(fpl)) {
4539 return (cache_fpl_partial(fpl));
4540 }
4541
4542 mp = atomic_load_ptr(&dvp->v_mount);
4543 if (__predict_false(mp == NULL)) {
4544 return (cache_fpl_aborted(fpl));
4545 }
4546
4547 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
4548 cache_fpl_smr_exit(fpl);
4549 /*
4550 * The original code does not check for CREATE, which
4551 * might be a bug. For now let the old lookup decide.
4552 */
4553 if (cnp->cn_nameiop == CREATE) {
4554 return (cache_fpl_aborted(fpl));
4555 }
4556 return (cache_fpl_handled_error(fpl, EROFS));
4557 }
4558
4559 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
4560 cache_fpl_smr_exit(fpl);
4561 return (cache_fpl_handled_error(fpl, EEXIST));
4562 }
4563
4564 /*
4565 * Secure access to dvp; check cache_fplookup_partial_setup for
4566 * reasoning.
4567 *
4568 * XXX At least UFS requires its lookup routine to be called for
4569 * the last path component, which leads to some level of complication
4570 * and inefficiency:
4571 * - the target routine always locks the target vnode, but our caller
4572 * may not need it locked
4573 * - some of the VOP machinery asserts that the parent is locked, which
4574 * once more may be not required
4575 *
4576 * TODO: add a flag for filesystems which don't need this.
4577 */
4578 dvs = vget_prep_smr(dvp);
4579 cache_fpl_smr_exit(fpl);
4580 if (__predict_false(dvs == VGET_NONE)) {
4581 return (cache_fpl_aborted(fpl));
4582 }
4583
4584 vget_finish_ref(dvp, dvs);
4585 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4586 vrele(dvp);
4587 return (cache_fpl_aborted(fpl));
4588 }
4589
4590 error = vn_lock(dvp, LK_EXCLUSIVE);
4591 if (__predict_false(error != 0)) {
4592 vrele(dvp);
4593 return (cache_fpl_aborted(fpl));
4594 }
4595
4596 tvp = NULL;
4597 cnp->cn_flags |= ISLASTCN;
4598 if (docache)
4599 cnp->cn_flags |= MAKEENTRY;
4600 if (cache_fpl_isdotdot(cnp))
4601 cnp->cn_flags |= ISDOTDOT;
4602 cnp->cn_lkflags = LK_EXCLUSIVE;
4603 error = VOP_LOOKUP(dvp, &tvp, cnp);
4604 switch (error) {
4605 case EJUSTRETURN:
4606 case 0:
4607 break;
4608 case ENOTDIR:
4609 case ENOENT:
4610 vput(dvp);
4611 return (cache_fpl_handled_error(fpl, error));
4612 default:
4613 vput(dvp);
4614 return (cache_fpl_aborted(fpl));
4615 }
4616
4617 fpl->tvp = tvp;
4618 fpl->savename = (cnp->cn_flags & SAVENAME) != 0;
4619
4620 if (tvp == NULL) {
4621 if ((cnp->cn_flags & SAVESTART) != 0) {
4622 ndp->ni_startdir = dvp;
4623 vrefact(ndp->ni_startdir);
4624 cnp->cn_flags |= SAVENAME;
4625 fpl->savename = true;
4626 }
4627 MPASS(error == EJUSTRETURN);
4628 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4629 VOP_UNLOCK(dvp);
4630 }
4631 return (cache_fpl_handled(fpl));
4632 }
4633
4634 /*
4635 * There are very hairy corner cases concerning various flag combinations
4636 * and locking state. In particular here we only hold one lock instead of
4637 * two.
4638 *
4639 * Skip the complexity as it is of no significance for normal workloads.
4640 */
4641 if (__predict_false(tvp == dvp)) {
4642 vput(dvp);
4643 vrele(tvp);
4644 return (cache_fpl_aborted(fpl));
4645 }
4646
4647 /*
4648 * If they want the symlink itself we are fine, but if they want to
4649 * follow it regular lookup has to be engaged.
4650 */
4651 if (tvp->v_type == VLNK) {
4652 if ((cnp->cn_flags & FOLLOW) != 0) {
4653 vput(dvp);
4654 vput(tvp);
4655 return (cache_fpl_aborted(fpl));
4656 }
4657 }
4658
4659 /*
4660 * Since we expect this to be the terminal vnode, it should almost never
4661 * be a mount point.
4662 */
4663 if (__predict_false(cache_fplookup_is_mp(fpl))) {
4664 vput(dvp);
4665 vput(tvp);
4666 return (cache_fpl_aborted(fpl));
4667 }
4668
4669 if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
4670 vput(dvp);
4671 vput(tvp);
4672 return (cache_fpl_handled_error(fpl, EEXIST));
4673 }
4674
4675 if ((cnp->cn_flags & LOCKLEAF) == 0) {
4676 VOP_UNLOCK(tvp);
4677 }
4678
4679 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4680 VOP_UNLOCK(dvp);
4681 }
4682
4683 if ((cnp->cn_flags & SAVESTART) != 0) {
4684 ndp->ni_startdir = dvp;
4685 vrefact(ndp->ni_startdir);
4686 cnp->cn_flags |= SAVENAME;
4687 fpl->savename = true;
4688 }
4689
4690 return (cache_fpl_handled(fpl));
4691 }
4692
4693 static int __noinline
4694 cache_fplookup_modifying(struct cache_fpl *fpl)
4695 {
4696 struct nameidata *ndp;
4697
4698 ndp = fpl->ndp;
4699
4700 if (!cache_fpl_islastcn(ndp)) {
4701 return (cache_fpl_partial(fpl));
4702 }
4703 return (cache_fplookup_final_modifying(fpl));
4704 }
4705
4706 static int __noinline
4707 cache_fplookup_final_withparent(struct cache_fpl *fpl)
4708 {
4709 struct componentname *cnp;
4710 enum vgetstate dvs, tvs;
4711 struct vnode *dvp, *tvp;
4712 seqc_t dvp_seqc;
4713 int error;
4714
4715 cnp = fpl->cnp;
4716 dvp = fpl->dvp;
4717 dvp_seqc = fpl->dvp_seqc;
4718 tvp = fpl->tvp;
4719
4720 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
4721
4722 /*
4723 * This is less efficient than it can be for simplicity.
4724 */
4725 dvs = vget_prep_smr(dvp);
4726 if (__predict_false(dvs == VGET_NONE)) {
4727 return (cache_fpl_aborted(fpl));
4728 }
4729 tvs = vget_prep_smr(tvp);
4730 if (__predict_false(tvs == VGET_NONE)) {
4731 cache_fpl_smr_exit(fpl);
4732 vget_abort(dvp, dvs);
4733 return (cache_fpl_aborted(fpl));
4734 }
4735
4736 cache_fpl_smr_exit(fpl);
4737
4738 if ((cnp->cn_flags & LOCKPARENT) != 0) {
4739 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
4740 if (__predict_false(error != 0)) {
4741 vget_abort(tvp, tvs);
4742 return (cache_fpl_aborted(fpl));
4743 }
4744 } else {
4745 vget_finish_ref(dvp, dvs);
4746 }
4747
4748 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4749 vget_abort(tvp, tvs);
4750 if ((cnp->cn_flags & LOCKPARENT) != 0)
4751 vput(dvp);
4752 else
4753 vrele(dvp);
4754 return (cache_fpl_aborted(fpl));
4755 }
4756
4757 error = cache_fplookup_final_child(fpl, tvs);
4758 if (__predict_false(error != 0)) {
4759 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED ||
4760 fpl->status == CACHE_FPL_STATUS_DESTROYED);
4761 if ((cnp->cn_flags & LOCKPARENT) != 0)
4762 vput(dvp);
4763 else
4764 vrele(dvp);
4765 return (error);
4766 }
4767
4768 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
4769 return (0);
4770 }
4771
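/*
 * The terminal vnode was found; dispatch on the requested semantics:
 * modifying operations and lookups wanting the parent get their dedicated
 * handlers, the common case only has to secure the child.
 */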
4772 static int
4773 cache_fplookup_final(struct cache_fpl *fpl)
4774 {
4775 struct componentname *cnp;
4776 enum vgetstate tvs;
4777 struct vnode *dvp, *tvp;
4778 seqc_t dvp_seqc;
4779
4780 cnp = fpl->cnp;
4781 dvp = fpl->dvp;
4782 dvp_seqc = fpl->dvp_seqc;
4783 tvp = fpl->tvp;
4784
4785 MPASS(*(cnp->cn_nameptr) != '/');
4786
4787 if (cnp->cn_nameiop != LOOKUP) {
4788 return (cache_fplookup_final_modifying(fpl));
4789 }
4790
4791 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
4792 return (cache_fplookup_final_withparent(fpl));
4793
4794 tvs = vget_prep_smr(tvp);
4795 if (__predict_false(tvs == VGET_NONE)) {
4796 return (cache_fpl_partial(fpl));
4797 }
4798
4799 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4800 cache_fpl_smr_exit(fpl);
4801 vget_abort(tvp, tvs);
4802 return (cache_fpl_aborted(fpl));
4803 }
4804
4805 cache_fpl_smr_exit(fpl);
4806 return (cache_fplookup_final_child(fpl, tvs));
4807 }
4808
4809 /*
4810 * Comment from locked lookup:
4811 * Check for degenerate name (e.g. / or "") which is a way of talking about a
4812 * directory, e.g. like "/." or ".".
4813 */
4814 static int __noinline
4815 cache_fplookup_degenerate(struct cache_fpl *fpl)
4816 {
4817 struct componentname *cnp;
4818 struct vnode *dvp;
4819 enum vgetstate dvs;
4820 int error, lkflags;
4821 #ifdef INVARIANTS
4822 char *cp;
4823 #endif
4824
4825 fpl->tvp = fpl->dvp;
4826 fpl->tvp_seqc = fpl->dvp_seqc;
4827
4828 cnp = fpl->cnp;
4829 dvp = fpl->dvp;
4830
4831 #ifdef INVARIANTS
4832 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) {
4833 KASSERT(*cp == '/',
4834 ("%s: encountered non-slash; string [%s]\n", __func__,
4835 cnp->cn_pnbuf));
4836 }
4837 #endif
4838
4839 if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
4840 cache_fpl_smr_exit(fpl);
4841 return (cache_fpl_handled_error(fpl, EISDIR));
4842 }
4843
4844 MPASS((cnp->cn_flags & SAVESTART) == 0);
4845
4846 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
4847 return (cache_fplookup_final_withparent(fpl));
4848 }
4849
4850 dvs = vget_prep_smr(dvp);
4851 cache_fpl_smr_exit(fpl);
4852 if (__predict_false(dvs == VGET_NONE)) {
4853 return (cache_fpl_aborted(fpl));
4854 }
4855
4856 if ((cnp->cn_flags & LOCKLEAF) != 0) {
4857 lkflags = LK_SHARED;
4858 if ((cnp->cn_flags & LOCKSHARED) == 0)
4859 lkflags = LK_EXCLUSIVE;
4860 error = vget_finish(dvp, lkflags, dvs);
4861 if (__predict_false(error != 0)) {
4862 return (cache_fpl_aborted(fpl));
4863 }
4864 } else {
4865 vget_finish_ref(dvp, dvs);
4866 }
4867 return (cache_fpl_handled(fpl));
4868 }
4869
4870 static int __noinline
4871 cache_fplookup_emptypath(struct cache_fpl *fpl)
4872 {
4873 struct nameidata *ndp;
4874 struct componentname *cnp;
4875 enum vgetstate tvs;
4876 struct vnode *tvp;
4877 int error, lkflags;
4878
4879 fpl->tvp = fpl->dvp;
4880 fpl->tvp_seqc = fpl->dvp_seqc;
4881
4882 ndp = fpl->ndp;
4883 cnp = fpl->cnp;
4884 tvp = fpl->tvp;
4885
4886 MPASS(*cnp->cn_pnbuf == '\0');
4887
4888 if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) {
4889 cache_fpl_smr_exit(fpl);
4890 return (cache_fpl_handled_error(fpl, ENOENT));
4891 }
4892
4893 MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0);
4894
4895 tvs = vget_prep_smr(tvp);
4896 cache_fpl_smr_exit(fpl);
4897 if (__predict_false(tvs == VGET_NONE)) {
4898 return (cache_fpl_aborted(fpl));
4899 }
4900
4901 if ((cnp->cn_flags & LOCKLEAF) != 0) {
4902 lkflags = LK_SHARED;
4903 if ((cnp->cn_flags & LOCKSHARED) == 0)
4904 lkflags = LK_EXCLUSIVE;
4905 error = vget_finish(tvp, lkflags, tvs);
4906 if (__predict_false(error != 0)) {
4907 return (cache_fpl_aborted(fpl));
4908 }
4909 } else {
4910 vget_finish_ref(tvp, tvs);
4911 }
4912
4913 ndp->ni_resflags |= NIRES_EMPTYPATH;
4914 return (cache_fpl_handled(fpl));
4915 }
4916
4917 static int __noinline
4918 cache_fplookup_noentry(struct cache_fpl *fpl)
4919 {
4920 struct nameidata *ndp;
4921 struct componentname *cnp;
4922 enum vgetstate dvs;
4923 struct vnode *dvp, *tvp;
4924 seqc_t dvp_seqc;
4925 int error;
4926
4927 ndp = fpl->ndp;
4928 cnp = fpl->cnp;
4929 dvp = fpl->dvp;
4930 dvp_seqc = fpl->dvp_seqc;
4931
4932 MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4933 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4934 if (cnp->cn_nameiop == LOOKUP)
4935 MPASS((cnp->cn_flags & NOCACHE) == 0);
4936 MPASS(!cache_fpl_isdotdot(cnp));
4937
4938 /*
4939 * Hack: delayed name len checking.
4940 */
4941 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4942 cache_fpl_smr_exit(fpl);
4943 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
4944 }
4945
4946 if (cnp->cn_nameptr[0] == '/') {
4947 return (cache_fplookup_skip_slashes(fpl));
4948 }
4949
4950 if (cnp->cn_pnbuf[0] == '\0') {
4951 return (cache_fplookup_emptypath(fpl));
4952 }
4953
4954 if (cnp->cn_nameptr[0] == '\0') {
4955 if (fpl->tvp == NULL) {
4956 return (cache_fplookup_degenerate(fpl));
4957 }
4958 return (cache_fplookup_trailingslash(fpl));
4959 }
4960
4961 if (cnp->cn_nameiop != LOOKUP) {
4962 fpl->tvp = NULL;
4963 return (cache_fplookup_modifying(fpl));
4964 }
4965
4966 MPASS((cnp->cn_flags & SAVESTART) == 0);
4967
4968 /*
4969 * Only try to fill in the component if it is the last one,
4970 * otherwise not only may there be several to handle, but the
4971 * walk may be complicated.
4972 */
4973 if (!cache_fpl_islastcn(ndp)) {
4974 return (cache_fpl_partial(fpl));
4975 }
4976
4977 /*
4978 * Regular lookup nullifies the slash, which we don't do here.
4979 * Don't take chances with filesystem routines seeing it for
4980 * the last entry.
4981 */
4982 if (cache_fpl_istrailingslash(fpl)) {
4983 return (cache_fpl_partial(fpl));
4984 }
4985
4986 /*
4987 * Secure access to dvp; check cache_fplookup_partial_setup for
4988 * reasoning.
4989 */
4990 dvs = vget_prep_smr(dvp);
4991 cache_fpl_smr_exit(fpl);
4992 if (__predict_false(dvs == VGET_NONE)) {
4993 return (cache_fpl_aborted(fpl));
4994 }
4995
4996 vget_finish_ref(dvp, dvs);
4997 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4998 vrele(dvp);
4999 return (cache_fpl_aborted(fpl));
5000 }
5001
5002 error = vn_lock(dvp, LK_SHARED);
5003 if (__predict_false(error != 0)) {
5004 vrele(dvp);
5005 return (cache_fpl_aborted(fpl));
5006 }
5007
5008 tvp = NULL;
5009 /*
5010 * TODO: provide variants which don't require locking either vnode.
5011 */
5012 cnp->cn_flags |= ISLASTCN | MAKEENTRY;
5013 cnp->cn_lkflags = LK_SHARED;
5014 if ((cnp->cn_flags & LOCKSHARED) == 0) {
5015 cnp->cn_lkflags = LK_EXCLUSIVE;
5016 }
5017 error = VOP_LOOKUP(dvp, &tvp, cnp);
5018 switch (error) {
5019 case EJUSTRETURN:
5020 case 0:
5021 break;
5022 case ENOTDIR:
5023 case ENOENT:
5024 vput(dvp);
5025 return (cache_fpl_handled_error(fpl, error));
5026 default:
5027 vput(dvp);
5028 return (cache_fpl_aborted(fpl));
5029 }
5030
5031 fpl->tvp = tvp;
5032 if (!fpl->savename) {
5033 MPASS((cnp->cn_flags & SAVENAME) == 0);
5034 }
5035
5036 if (tvp == NULL) {
5037 MPASS(error == EJUSTRETURN);
5038 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
5039 vput(dvp);
5040 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
5041 VOP_UNLOCK(dvp);
5042 }
5043 return (cache_fpl_handled(fpl));
5044 }
5045
5046 if (tvp->v_type == VLNK) {
5047 if ((cnp->cn_flags & FOLLOW) != 0) {
5048 vput(dvp);
5049 vput(tvp);
5050 return (cache_fpl_aborted(fpl));
5051 }
5052 }
5053
5054 if (__predict_false(cache_fplookup_is_mp(fpl))) {
5055 vput(dvp);
5056 vput(tvp);
5057 return (cache_fpl_aborted(fpl));
5058 }
5059
5060 if ((cnp->cn_flags & LOCKLEAF) == 0) {
5061 VOP_UNLOCK(tvp);
5062 }
5063
5064 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
5065 vput(dvp);
5066 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
5067 VOP_UNLOCK(dvp);
5068 }
5069 return (cache_fpl_handled(fpl));
5070 }
5071
5072 static int __noinline
5073 cache_fplookup_dot(struct cache_fpl *fpl)
5074 {
5075 int error;
5076
5077 MPASS(!seqc_in_modify(fpl->dvp_seqc));
5078 /*
5079 * Just re-assign the value. seqc will be checked later for the first
5080 * non-dot path component in line and/or before deciding to return the
5081 * vnode.
5082 */
5083 fpl->tvp = fpl->dvp;
5084 fpl->tvp_seqc = fpl->dvp_seqc;
5085
5086 counter_u64_add(dothits, 1);
5087 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
5088
5089 error = 0;
5090 if (cache_fplookup_is_mp(fpl)) {
5091 error = cache_fplookup_cross_mount(fpl);
5092 }
5093 return (error);
5094 }
5095
5096 static int __noinline
5097 cache_fplookup_dotdot(struct cache_fpl *fpl)
5098 {
5099 struct nameidata *ndp;
5100 struct componentname *cnp;
5101 struct namecache *ncp;
5102 struct vnode *dvp;
5103 struct prison *pr;
5104 u_char nc_flag;
5105
5106 ndp = fpl->ndp;
5107 cnp = fpl->cnp;
5108 dvp = fpl->dvp;
5109
5110 MPASS(cache_fpl_isdotdot(cnp));
5111
5112 /*
5113 * XXX this is racy the same way regular lookup is
5114 */
5115 for (pr = cnp->cn_cred->cr_prison; pr != NULL;
5116 pr = pr->pr_parent)
5117 if (dvp == pr->pr_root)
5118 break;
5119
5120 if (dvp == ndp->ni_rootdir ||
5121 dvp == ndp->ni_topdir ||
5122 dvp == rootvnode ||
5123 pr != NULL) {
5124 fpl->tvp = dvp;
5125 fpl->tvp_seqc = vn_seqc_read_any(dvp);
5126 if (seqc_in_modify(fpl->tvp_seqc)) {
5127 return (cache_fpl_aborted(fpl));
5128 }
5129 return (0);
5130 }
5131
5132 if ((dvp->v_vflag & VV_ROOT) != 0) {
5133 /*
5134 * TODO
5135 * The opposite of climb mount is needed here.
5136 */
5137 return (cache_fpl_partial(fpl));
5138 }
5139
5140 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
5141 if (ncp == NULL) {
5142 return (cache_fpl_aborted(fpl));
5143 }
5144
5145 nc_flag = atomic_load_char(&ncp->nc_flag);
5146 if ((nc_flag & NCF_ISDOTDOT) != 0) {
5147 if ((nc_flag & NCF_NEGATIVE) != 0)
5148 return (cache_fpl_aborted(fpl));
5149 fpl->tvp = ncp->nc_vp;
5150 } else {
5151 fpl->tvp = ncp->nc_dvp;
5152 }
5153
5154 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
5155 if (seqc_in_modify(fpl->tvp_seqc)) {
5156 return (cache_fpl_partial(fpl));
5157 }
5158
5159 /*
5160 * Acquire fence provided by vn_seqc_read_any above.
5161 */
5162 if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
5163 return (cache_fpl_aborted(fpl));
5164 }
5165
5166 if (!cache_ncp_canuse(ncp)) {
5167 return (cache_fpl_aborted(fpl));
5168 }
5169
5170 counter_u64_add(dotdothits, 1);
5171 return (0);
5172 }
5173
5174 static int __noinline
5175 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
5176 {
5177 u_char nc_flag __diagused;
5178 bool neg_promote;
5179
5180 #ifdef INVARIANTS
5181 nc_flag = atomic_load_char(&ncp->nc_flag);
5182 MPASS((nc_flag & NCF_NEGATIVE) != 0);
5183 #endif
5184 /*
5185 * If they want to create an entry we need to replace this one.
5186 */
5187 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
5188 fpl->tvp = NULL;
5189 return (cache_fplookup_modifying(fpl));
5190 }
5191 neg_promote = cache_neg_hit_prep(ncp);
5192 if (!cache_fpl_neg_ncp_canuse(ncp)) {
5193 cache_neg_hit_abort(ncp);
5194 return (cache_fpl_partial(fpl));
5195 }
5196 if (neg_promote) {
5197 return (cache_fplookup_negative_promote(fpl, ncp, hash));
5198 }
5199 cache_neg_hit_finish(ncp);
5200 cache_fpl_smr_exit(fpl);
5201 return (cache_fpl_handled_error(fpl, ENOENT));
5202 }
5203
5204 /*
5205 * Resolve a symlink. Called by filesystem-specific routines.
5206 *
5207 * Code flow is:
5208 * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
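 *
 * A filesystem's VOP_FPLOOKUP_SYMLINK implementation is expected to fetch
 * the link target without blocking and hand it over, along the lines of
 * (illustrative sketch, the node field names are made up):
 *
 *	return (cache_symlink_resolve(fpl, np->n_target, np->n_targetlen));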
5209 */
5210 int
5211 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
5212 {
5213 struct nameidata *ndp;
5214 struct componentname *cnp;
5215 size_t adjust;
5216
5217 ndp = fpl->ndp;
5218 cnp = fpl->cnp;
5219
5220 if (__predict_false(len == 0)) {
5221 return (ENOENT);
5222 }
5223
5224 if (__predict_false(len > MAXPATHLEN - 2)) {
5225 if (cache_fpl_istrailingslash(fpl)) {
5226 return (EAGAIN);
5227 }
5228 }
5229
5230 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
5231 #ifdef INVARIANTS
5232 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
5233 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5234 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5235 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5236 }
5237 #endif
5238
5239 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
5240 return (ENAMETOOLONG);
5241 }
5242
5243 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
5244 return (ELOOP);
5245 }
5246
5247 adjust = len;
5248 if (ndp->ni_pathlen > 1) {
5249 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
5250 } else {
5251 if (cache_fpl_istrailingslash(fpl)) {
5252 adjust = len + 1;
5253 cnp->cn_pnbuf[len] = '/';
5254 cnp->cn_pnbuf[len + 1] = '\0';
5255 } else {
5256 cnp->cn_pnbuf[len] = '\0';
5257 }
5258 }
5259 bcopy(string, cnp->cn_pnbuf, len);
5260
5261 ndp->ni_pathlen += adjust;
5262 cache_fpl_pathlen_add(fpl, adjust);
5263 cnp->cn_nameptr = cnp->cn_pnbuf;
5264 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
5265 fpl->tvp = NULL;
5266 return (0);
5267 }
5268
5269 static int __noinline
5270 cache_fplookup_symlink(struct cache_fpl *fpl)
5271 {
5272 struct mount *mp;
5273 struct nameidata *ndp;
5274 struct componentname *cnp;
5275 struct vnode *dvp, *tvp;
5276 int error;
5277
5278 ndp = fpl->ndp;
5279 cnp = fpl->cnp;
5280 dvp = fpl->dvp;
5281 tvp = fpl->tvp;
5282
5283 if (cache_fpl_islastcn(ndp)) {
5284 if ((cnp->cn_flags & FOLLOW) == 0) {
5285 return (cache_fplookup_final(fpl));
5286 }
5287 }
5288
5289 mp = atomic_load_ptr(&dvp->v_mount);
5290 if (__predict_false(mp == NULL)) {
5291 return (cache_fpl_aborted(fpl));
5292 }
5293
5294 /*
5295 * Note this check races against setting the flag just like regular
5296 * lookup.
5297 */
5298 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
5299 cache_fpl_smr_exit(fpl);
5300 return (cache_fpl_handled_error(fpl, EACCES));
5301 }
5302
5303 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
5304 if (__predict_false(error != 0)) {
5305 switch (error) {
5306 case EAGAIN:
5307 return (cache_fpl_partial(fpl));
5308 case ENOENT:
5309 case ENAMETOOLONG:
5310 case ELOOP:
5311 cache_fpl_smr_exit(fpl);
5312 return (cache_fpl_handled_error(fpl, error));
5313 default:
5314 return (cache_fpl_aborted(fpl));
5315 }
5316 }
5317
5318 if (*(cnp->cn_nameptr) == '/') {
5319 fpl->dvp = cache_fpl_handle_root(fpl);
5320 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5321 if (seqc_in_modify(fpl->dvp_seqc)) {
5322 return (cache_fpl_aborted(fpl));
5323 }
5324 /*
5325 * The main loop assumes that ->dvp points to a vnode belonging
5326 * to a filesystem which can do lockless lookup, but the absolute
5327 * symlink may wander off to one which does not.
5328 */
5329 mp = atomic_load_ptr(&fpl->dvp->v_mount);
5330 if (__predict_false(mp == NULL)) {
5331 return (cache_fpl_aborted(fpl));
5332 }
5333 if (!cache_fplookup_mp_supported(mp)) {
5334 cache_fpl_checkpoint(fpl);
5335 return (cache_fpl_partial(fpl));
5336 }
5337 }
5338 return (0);
5339 }
5340
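/*
 * Look up the freshly parsed component: "." and ".." get dedicated handling,
 * everything else is searched for in the hash chain computed by
 * cache_fplookup_parse.
 */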
5341 static int
5342 cache_fplookup_next(struct cache_fpl *fpl)
5343 {
5344 struct componentname *cnp;
5345 struct namecache *ncp;
5346 struct vnode *dvp, *tvp;
5347 u_char nc_flag;
5348 uint32_t hash;
5349 int error;
5350
5351 cnp = fpl->cnp;
5352 dvp = fpl->dvp;
5353 hash = fpl->hash;
5354
5355 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
5356 if (cnp->cn_namelen == 1) {
5357 return (cache_fplookup_dot(fpl));
5358 }
5359 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
5360 return (cache_fplookup_dotdot(fpl));
5361 }
5362 }
5363
5364 MPASS(!cache_fpl_isdotdot(cnp));
5365
5366 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
5367 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
5368 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
5369 break;
5370 }
5371
5372 if (__predict_false(ncp == NULL)) {
5373 return (cache_fplookup_noentry(fpl));
5374 }
5375
5376 tvp = atomic_load_ptr(&ncp->nc_vp);
5377 nc_flag = atomic_load_char(&ncp->nc_flag);
5378 if ((nc_flag & NCF_NEGATIVE) != 0) {
5379 return (cache_fplookup_neg(fpl, ncp, hash));
5380 }
5381
5382 if (!cache_ncp_canuse(ncp)) {
5383 return (cache_fpl_partial(fpl));
5384 }
5385
5386 fpl->tvp = tvp;
5387 fpl->tvp_seqc = vn_seqc_read_any(tvp);
5388 if (seqc_in_modify(fpl->tvp_seqc)) {
5389 return (cache_fpl_partial(fpl));
5390 }
5391
5392 counter_u64_add(numposhits, 1);
5393 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
5394
5395 error = 0;
5396 if (cache_fplookup_is_mp(fpl)) {
5397 error = cache_fplookup_cross_mount(fpl);
5398 }
5399 return (error);
5400 }
5401
5402 static bool
5403 cache_fplookup_mp_supported(struct mount *mp)
5404 {
5405
5406 MPASS(mp != NULL);
5407 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
5408 return (false);
5409 return (true);
5410 }
5411
5412 /*
5413 * Walk up the mount stack (if any).
5414 *
5415 * Correctness is provided in the following ways:
5416 * - all vnodes are protected from freeing with SMR
5417 * - struct mount objects are type stable making them always safe to access
5418 * - stability of the particular mount is provided by busying it
5419 * - relationship between the vnode which is mounted on and the mount is
5420 * verified with the vnode sequence counter after busying
5421 * - association between root vnode of the mount and the mount is protected
5422 * by busy
5423 *
5424 * From that point on we can read the sequence counter of the root vnode
5425 * and get the next mount on the stack (if any) using the same protection.
5426 *
5427 * By the end of successful walk we are guaranteed the reached state was
5428 * indeed present at least at some point which matches the regular lookup.
5429 */
5430 static int __noinline
5431 cache_fplookup_climb_mount(struct cache_fpl *fpl)
5432 {
5433 struct mount *mp, *prev_mp;
5434 struct mount_pcpu *mpcpu, *prev_mpcpu;
5435 struct vnode *vp;
5436 seqc_t vp_seqc;
5437
5438 vp = fpl->tvp;
5439 vp_seqc = fpl->tvp_seqc;
5440
5441 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5442 mp = atomic_load_ptr(&vp->v_mountedhere);
5443 if (__predict_false(mp == NULL)) {
5444 return (0);
5445 }
5446
5447 prev_mp = NULL;
5448 for (;;) {
5449 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5450 if (prev_mp != NULL)
5451 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5452 return (cache_fpl_partial(fpl));
5453 }
5454 if (prev_mp != NULL)
5455 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5456 if (!vn_seqc_consistent(vp, vp_seqc)) {
5457 vfs_op_thread_exit_crit(mp, mpcpu);
5458 return (cache_fpl_partial(fpl));
5459 }
5460 if (!cache_fplookup_mp_supported(mp)) {
5461 vfs_op_thread_exit_crit(mp, mpcpu);
5462 return (cache_fpl_partial(fpl));
5463 }
5464 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5465 if (vp == NULL) {
5466 vfs_op_thread_exit_crit(mp, mpcpu);
5467 return (cache_fpl_partial(fpl));
5468 }
5469 vp_seqc = vn_seqc_read_any(vp);
5470 if (seqc_in_modify(vp_seqc)) {
5471 vfs_op_thread_exit_crit(mp, mpcpu);
5472 return (cache_fpl_partial(fpl));
5473 }
5474 prev_mp = mp;
5475 prev_mpcpu = mpcpu;
5476 mp = atomic_load_ptr(&vp->v_mountedhere);
5477 if (mp == NULL)
5478 break;
5479 }
5480
5481 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5482 fpl->tvp = vp;
5483 fpl->tvp_seqc = vp_seqc;
5484 return (0);
5485 }
5486
5487 static int __noinline
5488 cache_fplookup_cross_mount(struct cache_fpl *fpl)
5489 {
5490 struct mount *mp;
5491 struct mount_pcpu *mpcpu;
5492 struct vnode *vp;
5493 seqc_t vp_seqc;
5494
5495 vp = fpl->tvp;
5496 vp_seqc = fpl->tvp_seqc;
5497
5498 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5499 mp = atomic_load_ptr(&vp->v_mountedhere);
5500 if (__predict_false(mp == NULL)) {
5501 return (0);
5502 }
5503
5504 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5505 return (cache_fpl_partial(fpl));
5506 }
5507 if (!vn_seqc_consistent(vp, vp_seqc)) {
5508 vfs_op_thread_exit_crit(mp, mpcpu);
5509 return (cache_fpl_partial(fpl));
5510 }
5511 if (!cache_fplookup_mp_supported(mp)) {
5512 vfs_op_thread_exit_crit(mp, mpcpu);
5513 return (cache_fpl_partial(fpl));
5514 }
5515 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5516 if (__predict_false(vp == NULL)) {
5517 vfs_op_thread_exit_crit(mp, mpcpu);
5518 return (cache_fpl_partial(fpl));
5519 }
5520 vp_seqc = vn_seqc_read_any(vp);
5521 vfs_op_thread_exit_crit(mp, mpcpu);
5522 if (seqc_in_modify(vp_seqc)) {
5523 return (cache_fpl_partial(fpl));
5524 }
5525 mp = atomic_load_ptr(&vp->v_mountedhere);
5526 if (__predict_false(mp != NULL)) {
5527 /*
5528 * There are possibly more mount points on top.
5529 * Normally this does not happen, so for simplicity just start
5530 * over.
5531 */
5532 return (cache_fplookup_climb_mount(fpl));
5533 }
5534
5535 fpl->tvp = vp;
5536 fpl->tvp_seqc = vp_seqc;
5537 return (0);
5538 }
5539
5540 /*
5541 * Check if a vnode is mounted on.
5542 */
5543 static bool
5544 cache_fplookup_is_mp(struct cache_fpl *fpl)
5545 {
5546 struct vnode *vp;
5547
5548 vp = fpl->tvp;
5549 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0);
5550 }
5551
5552 /*
5553 * Parse the path.
5554 *
5555 * The code was originally copy-pasted from regular lookup and despite
5556 * cleanups leaves performance on the table. Any modifications here
5557 * must take into account that in case of fallback the resulting
5558 * nameidata state has to be compatible with the original.
5559 */
5560
5561 /*
5562 * Debug ni_pathlen tracking.
5563 */
5564 #ifdef INVARIANTS
5565 static void
5566 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5567 {
5568
5569 fpl->debug.ni_pathlen += n;
5570 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5571 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5572 }
5573
5574 static void
5575 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5576 {
5577
5578 fpl->debug.ni_pathlen -= n;
5579 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5580 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5581 }
5582
5583 static void
5584 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5585 {
5586
5587 cache_fpl_pathlen_add(fpl, 1);
5588 }
5589
5590 static void
5591 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5592 {
5593
5594 cache_fpl_pathlen_sub(fpl, 1);
5595 }
5596 #else
5597 static void
5598 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5599 {
5600 }
5601
5602 static void
5603 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5604 {
5605 }
5606
5607 static void
5608 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5609 {
5610 }
5611
5612 static void
5613 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5614 {
5615 }
5616 #endif
5617
5618 static void
5619 cache_fplookup_parse(struct cache_fpl *fpl)
5620 {
5621 struct nameidata *ndp;
5622 struct componentname *cnp;
5623 struct vnode *dvp;
5624 char *cp;
5625 uint32_t hash;
5626
5627 ndp = fpl->ndp;
5628 cnp = fpl->cnp;
5629 dvp = fpl->dvp;
5630
5631 /*
5632 * Find the end of this path component, it is either / or nul.
5633 *
5634 * Store / as a temporary sentinel so that we only have one character
5635 * to test for. Pathnames tend to be short so this should not result
5636 * in cache misses.
5637 *
5638 * TODO: fix this to be word-sized.
5639 */
5640 MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf);
5641 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
5642 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
5643 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
5644 fpl->nulchar, cnp->cn_pnbuf));
5645 KASSERT(*fpl->nulchar == '\0',
5646 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
5647 cnp->cn_pnbuf));
5648 hash = cache_get_hash_iter_start(dvp);
5649 *fpl->nulchar = '/';
5650 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
5651 KASSERT(*cp != '\0',
5652 ("%s: encountered unexpected nul; string [%s]\n", __func__,
5653 cnp->cn_nameptr));
5654 hash = cache_get_hash_iter(*cp, hash);
5655 continue;
5656 }
5657 *fpl->nulchar = '\0';
5658 fpl->hash = cache_get_hash_iter_finish(hash);
5659
5660 cnp->cn_namelen = cp - cnp->cn_nameptr;
5661 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
5662
5663 #ifdef INVARIANTS
5664 /*
5665 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
5666 * we are going to fail this lookup with ENAMETOOLONG (see below).
5667 */
5668 if (cnp->cn_namelen <= NAME_MAX) {
5669 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
5670 panic("%s: mismatched hash for [%s] len %ld", __func__,
5671 cnp->cn_nameptr, cnp->cn_namelen);
5672 }
5673 }
5674 #endif
5675
5676 /*
5677 * Hack: we have to check if the found path component's length exceeds
5678 * NAME_MAX. However, the condition is very rarely true and the check can
5679 * be elided in the common case -- if an entry was found in the cache,
5680 * then it could not have been too long to begin with.
5681 */
5682 ndp->ni_next = cp;
5683 }
5684
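/*
 * Move on to the next path component, skipping over the slash left in place
 * by cache_fplookup_parse.
 */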
5685 static void
5686 cache_fplookup_parse_advance(struct cache_fpl *fpl)
5687 {
5688 struct nameidata *ndp;
5689 struct componentname *cnp;
5690
5691 ndp = fpl->ndp;
5692 cnp = fpl->cnp;
5693
5694 cnp->cn_nameptr = ndp->ni_next;
5695 KASSERT(*(cnp->cn_nameptr) == '/',
5696 ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5697 cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
5698 cnp->cn_nameptr++;
5699 cache_fpl_pathlen_dec(fpl);
5700 }
5701
5702 /*
5703 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5704 *
5705 * Lockless lookup tries to elide checking for spurious slashes and, should they
5706 * be present, is guaranteed to fail to find an entry. In this case the caller
5707 * must check if the name starts with a slash and call this routine. It is
5708 * going to fast forward across the spurious slashes and set the state up for
5709 * retry.
5710 */
5711 static int __noinline
5712 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
5713 {
5714 struct nameidata *ndp;
5715 struct componentname *cnp;
5716
5717 ndp = fpl->ndp;
5718 cnp = fpl->cnp;
5719
5720 MPASS(*(cnp->cn_nameptr) == '/');
5721 do {
5722 cnp->cn_nameptr++;
5723 cache_fpl_pathlen_dec(fpl);
5724 } while (*(cnp->cn_nameptr) == '/');
5725
5726 /*
5727 * Go back to one slash so that cache_fplookup_parse_advance has
5728 * something to skip.
5729 */
5730 cnp->cn_nameptr--;
5731 cache_fpl_pathlen_inc(fpl);
5732
5733 /*
5734 * cache_fplookup_parse_advance starts from ndp->ni_next
5735 */
5736 ndp->ni_next = cnp->cn_nameptr;
5737
5738 /*
5739 * See cache_fplookup_dot.
5740 */
5741 fpl->tvp = fpl->dvp;
5742 fpl->tvp_seqc = fpl->dvp_seqc;
5743
5744 return (0);
5745 }
5746
5747 /*
5748 * Handle trailing slashes (e.g., "foo/").
5749 *
5750 * If a trailing slash is found the terminal vnode must be a directory.
5751 * Regular lookup shortens the path by nullifying the first trailing slash and
5752 * sets the TRAILINGSLASH flag to denote this took place. There are several
5753 * checks on it performed later.
5754 *
5755 * Similarly to spurious slashes, lockless lookup handles this in a speculative
5756 * manner relying on an invariant that a non-directory vnode will get a miss.
5757 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
5758 *
5759 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
5760 * and denotes this is the last path component, which avoids looping back.
5761 *
5762 * Only plain lookups are supported for now to limit the corner cases to handle.
5763 */
5764 static int __noinline
5765 cache_fplookup_trailingslash(struct cache_fpl *fpl)
5766 {
5767 #ifdef INVARIANTS
5768 size_t ni_pathlen;
5769 #endif
5770 struct nameidata *ndp;
5771 struct componentname *cnp;
5772 struct namecache *ncp;
5773 struct vnode *tvp;
5774 char *cn_nameptr_orig, *cn_nameptr_slash;
5775 seqc_t tvp_seqc;
5776 u_char nc_flag;
5777
5778 ndp = fpl->ndp;
5779 cnp = fpl->cnp;
5780 tvp = fpl->tvp;
5781 tvp_seqc = fpl->tvp_seqc;
5782
5783 MPASS(fpl->dvp == fpl->tvp);
5784 KASSERT(cache_fpl_istrailingslash(fpl),
5785 ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
5786 cnp->cn_pnbuf));
5787 KASSERT(cnp->cn_nameptr[0] == '\0',
5788 ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
5789 cnp->cn_pnbuf));
5790 KASSERT(cnp->cn_namelen == 0,
5791 ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen,
5792 cnp->cn_pnbuf));
5793 MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);
5794
5795 if (cnp->cn_nameiop != LOOKUP) {
5796 return (cache_fpl_aborted(fpl));
5797 }
5798
5799 if (__predict_false(tvp->v_type != VDIR)) {
5800 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
5801 return (cache_fpl_aborted(fpl));
5802 }
5803 cache_fpl_smr_exit(fpl);
5804 return (cache_fpl_handled_error(fpl, ENOTDIR));
5805 }
5806
5807 /*
5808 * Denote the last component.
5809 */
5810 ndp->ni_next = &cnp->cn_nameptr[0];
5811 MPASS(cache_fpl_islastcn(ndp));
5812
5813 /*
5814 * Unwind trailing slashes.
5815 */
5816 cn_nameptr_orig = cnp->cn_nameptr;
5817 while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
5818 cnp->cn_nameptr--;
5819 if (cnp->cn_nameptr[0] != '/') {
5820 break;
5821 }
5822 }
5823
5824 /*
5825 * Unwind to the beginning of the path component.
5826 *
5827 * Note the path may or may not have started with a slash.
5828 */
5829 cn_nameptr_slash = cnp->cn_nameptr;
5830 while (cnp->cn_nameptr > cnp->cn_pnbuf) {
5831 cnp->cn_nameptr--;
5832 if (cnp->cn_nameptr[0] == '/') {
5833 break;
5834 }
5835 }
5836 if (cnp->cn_nameptr[0] == '/') {
5837 cnp->cn_nameptr++;
5838 }
5839
5840 cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
5841 cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
5842 cache_fpl_checkpoint(fpl);
5843
5844 #ifdef INVARIANTS
5845 ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
5846 if (ni_pathlen != fpl->debug.ni_pathlen) {
5847 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5848 __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5849 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5850 }
5851 #endif
5852
5853 /*
5854 * If this was a "./" lookup the parent directory is already correct.
5855 */
5856 if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
5857 return (0);
5858 }
5859
5860 /*
5861 * Otherwise we need to look it up.
5862 */
5863 tvp = fpl->tvp;
5864 ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
5865 if (__predict_false(ncp == NULL)) {
5866 return (cache_fpl_aborted(fpl));
5867 }
5868 nc_flag = atomic_load_char(&ncp->nc_flag);
5869 if ((nc_flag & NCF_ISDOTDOT) != 0) {
5870 return (cache_fpl_aborted(fpl));
5871 }
5872 fpl->dvp = ncp->nc_dvp;
5873 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5874 if (seqc_in_modify(fpl->dvp_seqc)) {
5875 return (cache_fpl_aborted(fpl));
5876 }
5877 return (0);
5878 }
5879
5880 /*
5881 * See the API contract for VOP_FPLOOKUP_VEXEC.
5882 */
5883 static int __noinline
5884 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
5885 {
5886 struct componentname *cnp;
5887 struct vnode *dvp;
5888 seqc_t dvp_seqc;
5889
5890 cnp = fpl->cnp;
5891 dvp = fpl->dvp;
5892 dvp_seqc = fpl->dvp_seqc;
5893
5894 /*
5895 * Hack: delayed empty path checking.
5896 */
5897 if (cnp->cn_pnbuf[0] == '\0') {
5898 return (cache_fplookup_emptypath(fpl));
5899 }
5900
5901 /*
5902 * TODO: Due to ignoring trailing slashes, lookup will perform a
5903 * permission check on the last dir when it should not be doing it. It
5904 * may fail, but said failure should be ignored. It is possible to fix
5905 * it up fully without resorting to regular lookup, but for now just
5906 * abort.
5907 */
5908 if (cache_fpl_istrailingslash(fpl)) {
5909 return (cache_fpl_aborted(fpl));
5910 }
5911
5912 /*
5913 * Hack: delayed degenerate path checking.
5914 */
5915 if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
5916 return (cache_fplookup_degenerate(fpl));
5917 }
5918
5919 /*
5920 * Hack: delayed name len checking.
5921 */
5922 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
5923 cache_fpl_smr_exit(fpl);
5924 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
5925 }
5926
5927 /*
5928 * Hack: they may be looking up foo/bar, where foo is not a directory.
5929 * In such a case we need to return ENOTDIR, but we may happen to get
5930 * here with a different error.
5931 */
5932 if (dvp->v_type != VDIR) {
5933 error = ENOTDIR;
5934 }
5935
5936 /*
5937 * Hack: handle O_SEARCH.
5938 *
5939 * Open Group Base Specifications Issue 7, 2018 edition states:
5940 * <quote>
5941 * If the access mode of the open file description associated with the
5942 * file descriptor is not O_SEARCH, the function shall check whether
5943 * directory searches are permitted using the current permissions of
5944 * the directory underlying the file descriptor. If the access mode is
5945 * O_SEARCH, the function shall not perform the check.
5946 * </quote>
5947 *
5948 * Regular lookup tests for the NOEXECCHECK flag for every path
5949 * component to decide whether to do the permission check. However,
5950 * since most lookups never have the flag (and when they do it is only
5951 * present for the first path component), lockless lookup only acts on
5952 * it if there is a permission problem. Here the flag is represented
5953 * with a boolean so that we don't have to clear it on the way out.
5954 *
5955 * For simplicity this always aborts.
5956 * TODO: check if this is the first lookup and ignore the permission
5957 * problem. Note the flag has to survive fallback (if it happens to be
5958 * performed).
5959 */
5960 if (fpl->fsearch) {
5961 return (cache_fpl_aborted(fpl));
5962 }
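
/*
 * Illustrative userspace sketch of the O_SEARCH behavior described above
 * (not part of this file; the path and error handling are made up):
 * lookups relative to a descriptor opened with O_SEARCH are not subject
 * to the search permission check on that directory.
 *
 *	dfd = open("/some/dir", O_SEARCH | O_DIRECTORY);
 *	if (dfd == -1)
 *		err(1, "open");
 *	fd = openat(dfd, "file", O_RDONLY);
 *	if (fd == -1)
 *		err(1, "openat");
 */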
5963
5964 switch (error) {
5965 case EAGAIN:
5966 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5967 error = cache_fpl_aborted(fpl);
5968 } else {
5969 cache_fpl_partial(fpl);
5970 }
5971 break;
5972 default:
5973 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5974 error = cache_fpl_aborted(fpl);
5975 } else {
5976 cache_fpl_smr_exit(fpl);
5977 cache_fpl_handled_error(fpl, error);
5978 }
5979 break;
5980 }
5981 return (error);
5982 }
5983
5984 static int
5985 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
5986 {
5987 struct nameidata *ndp;
5988 struct componentname *cnp;
5989 struct mount *mp;
5990 int error;
5991
5992 ndp = fpl->ndp;
5993 cnp = fpl->cnp;
5994
5995 cache_fpl_checkpoint(fpl);
5996
5997 /*
5998 * The vnode at hand is almost always stable, so skip checking it here.
5999 * In the worst case this merely postpones the check to the end of the
6000 * main loop iteration.
6001 */
6002 fpl->dvp = dvp;
6003 fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);
6004
6005 mp = atomic_load_ptr(&dvp->v_mount);
6006 if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
6007 return (cache_fpl_aborted(fpl));
6008 }
6009
6010 MPASS(fpl->tvp == NULL);
6011
6012 for (;;) {
6013 cache_fplookup_parse(fpl);
6014
6015 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
6016 if (__predict_false(error != 0)) {
6017 error = cache_fplookup_failed_vexec(fpl, error);
6018 break;
6019 }
6020
6021 error = cache_fplookup_next(fpl);
6022 if (__predict_false(cache_fpl_terminated(fpl))) {
6023 break;
6024 }
6025
6026 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
6027
6028 if (fpl->tvp->v_type == VLNK) {
6029 error = cache_fplookup_symlink(fpl);
6030 if (cache_fpl_terminated(fpl)) {
6031 break;
6032 }
6033 } else {
6034 if (cache_fpl_islastcn(ndp)) {
6035 error = cache_fplookup_final(fpl);
6036 break;
6037 }
6038
6039 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
6040 error = cache_fpl_aborted(fpl);
6041 break;
6042 }
6043
6044 fpl->dvp = fpl->tvp;
6045 fpl->dvp_seqc = fpl->tvp_seqc;
6046 cache_fplookup_parse_advance(fpl);
6047 }
6048
6049 cache_fpl_checkpoint(fpl);
6050 }
6051
6052 return (error);
6053 }
6054
6055 /*
6056 * Fast path lookup protected with SMR and sequence counters.
6057 *
6058 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
6059 *
6060 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the criteria
6061 * outlined below.
6062 *
6063 * Traditional vnode lookup conceptually looks like this:
6064 *
6065 * vn_lock(current);
6066 * for (;;) {
6067 * next = find();
6068 * vn_lock(next);
6069 * vn_unlock(current);
6070 * current = next;
6071 * if (last)
6072 * break;
6073 * }
6074 * return (current);
6075 *
6076 * Each jump to the next vnode is safe memory-wise and atomic with respect to
6077 * any modifications thanks to holding respective locks.
6078 *
6079 * The same guarantee can be provided with a combination of safe memory
6080 * reclamation and sequence counters instead. If all operations which affect
6081 * the relationship between the current vnode and the one we are looking for
6082 * also modify the counter, we can verify whether all the conditions held as
6083 * we made the jump. This includes things like permissions, mount points etc.
6084 * Counter modification is provided by enclosing relevant places in
6085 * vn_seqc_write_begin()/end() calls.
6086 *
6087 * Thus this translates to:
6088 *
6089 * vfs_smr_enter();
6090 * dvp_seqc = seqc_read_any(dvp);
6091 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
6092 * abort();
6093 * for (;;) {
6094 * tvp = find();
6095 * tvp_seqc = seqc_read_any(tvp);
6096 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
6097 * abort();
6098 * if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
6099 * abort();
6100 * dvp = tvp; // we know nothing of importance has changed
6101 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
6102 * if (last)
6103 * break;
6104 * }
6105 * vget(); // secure the vnode
6106 * if (!seqc_consistent(tvp, tvp_seqc)) // final check
6107 * abort();
6108 * // at this point we know nothing has changed for any parent<->child pair
6109 * // as they were crossed during the lookup, meaning we matched the guarantee
6110 * // of the locked variant
6111 * return (tvp);
6112 *
6113 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
6114 * - they are called while within vfs_smr protection which they must never exit
6115 * - EAGAIN can be returned to denote that checking could not be performed; it is
6116 * always valid to return it
6117 * - if the sequence counter has not changed the result must be valid
6118 * - if the sequence counter has changed both false positives and false negatives
6119 * are permitted (since the result will be rejected later)
6120 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
6121 *
6122 * Caveats to watch out for:
6123 * - vnodes are passed unlocked and unreferenced with nothing stopping
6124 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
6125 * to use atomic_load_ptr to fetch it.
6126 * - the aforementioned object can also get freed, meaning that, absent other
6127 * means, it should be protected with vfs_smr
6128 * - either safely checking permissions as they are modified or guaranteeing
6129 * their stability is left to the routine
6130 */
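/*
 * As a rough illustration of the contract above, a filesystem's
 * VOP_FPLOOKUP_VEXEC implementation for plain unix permissions might look
 * along these lines (a sketch only; "xxx" and VTODATA_SMR stand in for
 * whatever SMR-safe accessors and node type the filesystem provides):
 *
 *	static int
 *	xxx_fplookup_vexec(struct vop_fplookup_vexec_args *ap)
 *	{
 *		struct xxx_node *np;
 *
 *		np = VTODATA_SMR(ap->a_vp);
 *		if (__predict_false(np == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(np->n_mode, np->n_uid, np->n_gid,
 *		    ap->a_cred));
 *	}
 *
 * On the modification side, code changing anything such a lookup depends on
 * (permissions, the name cache, mount points) brackets the change with the
 * sequence counter:
 *
 *	vn_seqc_write_begin(vp);
 *	// modify the vnode
 *	vn_seqc_write_end(vp);
 */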
6131 int
6132 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
6133 struct pwd **pwdp)
6134 {
6135 struct cache_fpl fpl;
6136 struct pwd *pwd;
6137 struct vnode *dvp;
6138 struct componentname *cnp;
6139 int error;
6140
6141 fpl.status = CACHE_FPL_STATUS_UNSET;
6142 fpl.in_smr = false;
6143 fpl.ndp = ndp;
6144 fpl.cnp = cnp = &ndp->ni_cnd;
6145 MPASS(ndp->ni_lcf == 0);
6146 MPASS(curthread == cnp->cn_thread);
6147 KASSERT((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
6148 ("%s: internal flags found in cn_flags %" PRIx64, __func__,
6149 cnp->cn_flags));
6150 if ((cnp->cn_flags & SAVESTART) != 0) {
6151 MPASS(cnp->cn_nameiop != LOOKUP);
6152 }
6153 MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
6154
6155 if (__predict_false(!cache_can_fplookup(&fpl))) {
6156 *status = fpl.status;
6157 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6158 return (EOPNOTSUPP);
6159 }
6160
6161 cache_fpl_checkpoint_outer(&fpl);
6162
6163 cache_fpl_smr_enter_initial(&fpl);
6164 #ifdef INVARIANTS
6165 fpl.debug.ni_pathlen = ndp->ni_pathlen;
6166 #endif
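/*
 * ni_pathlen counts the terminating nul, so nulchar ends up pointing at
 * the nul byte that ends the path buffer.
 */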
6167 fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
6168 fpl.fsearch = false;
6169 fpl.savename = (cnp->cn_flags & SAVENAME) != 0;
6170 fpl.tvp = NULL; /* for degenerate path handling */
6171 fpl.pwd = pwdp;
6172 pwd = pwd_get_smr();
6173 *(fpl.pwd) = pwd;
6174 ndp->ni_rootdir = pwd->pwd_rdir;
6175 ndp->ni_topdir = pwd->pwd_jdir;
6176
6177 if (cnp->cn_pnbuf[0] == '/') {
6178 dvp = cache_fpl_handle_root(&fpl);
6179 MPASS(ndp->ni_resflags == 0);
6180 ndp->ni_resflags = NIRES_ABS;
6181 } else {
6182 if (ndp->ni_dirfd == AT_FDCWD) {
6183 dvp = pwd->pwd_cdir;
6184 } else {
6185 error = cache_fplookup_dirfd(&fpl, &dvp);
6186 if (__predict_false(error != 0)) {
6187 goto out;
6188 }
6189 }
6190 }
6191
6192 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
6193 error = cache_fplookup_impl(dvp, &fpl);
6194 out:
6195 cache_fpl_smr_assert_not_entered(&fpl);
6196 cache_fpl_assert_status(&fpl);
6197 *status = fpl.status;
6198 if (SDT_PROBES_ENABLED()) {
6199 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6200 if (fpl.status == CACHE_FPL_STATUS_HANDLED)
6201 SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
6202 ndp);
6203 }
6204
6205 if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
6206 MPASS(error != CACHE_FPL_FAILED);
6207 if (error != 0) {
6208 MPASS(fpl.dvp == NULL);
6209 MPASS(fpl.tvp == NULL);
6210 MPASS(fpl.savename == false);
6211 }
6212 ndp->ni_dvp = fpl.dvp;
6213 ndp->ni_vp = fpl.tvp;
6214 if (fpl.savename) {
6215 cnp->cn_flags |= HASBUF;
6216 } else {
6217 cache_fpl_cleanup_cnp(cnp);
6218 }
6219 }
6220 return (error);
6221 }
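
/*
 * A rough sketch of how a caller is expected to consume the result (the real
 * fallback logic lives in namei() in vfs_lookup.c and differs in detail):
 *
 *	error = cache_fplookup(ndp, &status, &pwd);
 *	switch (status) {
 *	case CACHE_FPL_STATUS_HANDLED:
 *		// lockless lookup finished the job, error is authoritative
 *		return (error);
 *	case CACHE_FPL_STATUS_PARTIAL:
 *	case CACHE_FPL_STATUS_ABORTED:
 *		// fall back to the locked lookup
 *		break;
 *	}
 */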