sys/kern/vfs_cache.c
1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Poul-Henning Kamp of the FreeBSD Project.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
35 */
36
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39
40 #include "opt_ddb.h"
41 #include "opt_ktrace.h"
42
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/capsicum.h>
46 #include <sys/counter.h>
47 #include <sys/filedesc.h>
48 #include <sys/fnv_hash.h>
49 #include <sys/kernel.h>
50 #include <sys/ktr.h>
51 #include <sys/lock.h>
52 #include <sys/malloc.h>
53 #include <sys/fcntl.h>
54 #include <sys/jail.h>
55 #include <sys/mount.h>
56 #include <sys/namei.h>
57 #include <sys/proc.h>
58 #include <sys/seqc.h>
59 #include <sys/sdt.h>
60 #include <sys/smr.h>
61 #include <sys/smp.h>
62 #include <sys/syscallsubr.h>
63 #include <sys/sysctl.h>
64 #include <sys/sysproto.h>
65 #include <sys/vnode.h>
66 #include <ck_queue.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 #ifdef INVARIANTS
71 #include <machine/_inttypes.h>
72 #endif
73
76 #include <security/audit/audit.h>
77 #include <security/mac/mac_framework.h>
78
79 #ifdef DDB
80 #include <ddb/ddb.h>
81 #endif
82
83 #include <vm/uma.h>
84
85 /*
86 * High level overview of name caching in the VFS layer.
87 *
88 * Originally caching was implemented as part of UFS, later extracted to allow
89 * use by other filesystems. A decision was made to make it optional and
90 * completely detached from the rest of the kernel, which comes with limitations
91 * outlined near the end of this comment block.
92 *
93 * This fundamental choice needs to be revisited. In the meantime, the current
94  * state is described below. The significance of all notable routines is explained
95  * in comments placed above their implementation. Scattered throughout the
96 * file are TODO comments indicating shortcomings which can be fixed without
97 * reworking everything (most of the fixes will likely be reusable). Various
98  * details are omitted from this explanation to not clutter the overview; they
99 * have to be checked by reading the code and associated commentary.
100 *
101 * Keep in mind that it's individual path components which are cached, not full
102 * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
103 * one for each name.
104 *
105 * I. Data organization
106 *
107 * Entries are described by "struct namecache" objects and stored in a hash
108 * table. See cache_get_hash for more information.
109 *
110 * "struct vnode" contains pointers to source entries (names which can be found
111 * when traversing through said vnode), destination entries (names of that
112  * vnode; see "Limitations" for a breakdown on the subject), and a pointer to
113 * the parent vnode.
114 *
115 * The (directory vnode; name) tuple reliably determines the target entry if
116 * it exists.
117 *
118 * Since there are no small locks at this time (all are 32 bytes in size on
119 * LP64), the code works around the problem by introducing lock arrays to
120 * protect hash buckets and vnode lists.
121 *
122 * II. Filesystem integration
123 *
124 * Filesystems participating in name caching do the following:
125 * - set vop_lookup routine to vfs_cache_lookup
126 * - set vop_cachedlookup to whatever can perform the lookup if the above fails
127 * - if they support lockless lookup (see below), vop_fplookup_vexec and
128 * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
129 * mount point
130 * - call cache_purge or cache_vop_* routines to eliminate stale entries as
131 * applicable
132 * - call cache_enter to add entries depending on the MAKEENTRY flag
133 *
134 * With the above in mind, there are 2 entry points when doing lookups:
135 * - ... -> namei -> cache_fplookup -- this is the default
136 * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
137 * should the above fail
138 *
139 * Example code flow how an entry is added:
140 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
141 * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
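 *
 * As an illustration (a hedged sketch only -- the "xxx_" names are placeholders
 * and the exact operations a given filesystem implements will differ), a
 * participating filesystem's vnode operation vector may be wired up along
 * these lines:
 *
 *	.vop_lookup =		vfs_cache_lookup,
 *	.vop_cachedlookup =	xxx_cachedlookup,
 *	.vop_fplookup_vexec =	xxx_fplookup_vexec,
 *	.vop_fplookup_symlink =	xxx_fplookup_symlink,
 *
 * with the MNTK_FPLOOKUP flag set on the mount point when lockless lookup is
 * supported.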
142 *
143 * III. Performance considerations
144 *
145  * For the lockless case, forward lookup avoids any writes to shared areas apart
146 * from the terminal path component. In other words non-modifying lookups of
147 * different files don't suffer any scalability problems in the namecache.
148 * Looking up the same file is limited by VFS and goes beyond the scope of this
149 * file.
150 *
151 * At least on amd64 the single-threaded bottleneck for long paths is hashing
152  * (see cache_get_hash). There are cases where the code issues an acquire fence
153  * multiple times; these can be combined on architectures which suffer from it.
154 *
155  * For the locked case each encountered vnode has to be referenced and locked in
156  * order to be handed out to the caller (normally that's namei). This
157  * introduces a significant single-threaded hit and serialization when multi-threaded.
158 *
159 * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached --
160  * it avoids any writes to shared areas for any component.
161 *
162 * Unrelated insertions are partially serialized on updating the global entry
163 * counter and possibly serialized on colliding bucket or vnode locks.
164 *
165 * IV. Observability
166 *
167  * Note that not everything has an explicit dtrace probe, nor should it; thus
168 * some of the one-liners below depend on implementation details.
169 *
170 * Examples:
171 *
172 * # Check what lookups failed to be handled in a lockless manner. Column 1 is
173 * # line number, column 2 is status code (see cache_fpl_status)
174 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
175 *
176 * # Lengths of names added by binary name
177 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
178 *
179 * # Same as above but only those which exceed 64 characters
180 * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
181 *
182 * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
183 * # path is it
184 * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
185 *
186 * V. Limitations and implementation defects
187 *
188 * - since it is possible there is no entry for an open file, tools like
189 * "procstat" may fail to resolve fd -> vnode -> path to anything
190 * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
191 * shortage) in which case the above problem applies
192 * - hardlinks are not tracked, thus if a vnode is reachable in more than one
193 * way, resolving a name may return a different path than the one used to
194 * open it (even if said path is still valid)
195 * - by default entries are not added for newly created files
196  * - adding an entry may need to evict a negative entry first, which happens in 2
197 * distinct places (evicting on lookup, adding in a later VOP) making it
198 * impossible to simply reuse it
199 * - there is a simple scheme to evict negative entries as the cache is approaching
200 * its capacity, but it is very unclear if doing so is a good idea to begin with
201 * - vnodes are subject to being recycled even if target inode is left in memory,
202  *   which loses the name cache entries when it perhaps should not. In case of tmpfs,
203  *   names get duplicated -- kept by the filesystem itself and the namecache separately
204 * - struct namecache has a fixed size and comes in 2 variants, often wasting space.
205  *   It is now hard to replace with malloc due to dependence on SMR.
206 * - lack of better integration with the kernel also turns nullfs into a layered
207 * filesystem instead of something which can take advantage of caching
208 */
209
210 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
211 "Name cache");
212
213 SDT_PROVIDER_DECLARE(vfs);
214 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
215 "struct vnode *");
216 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
217 "struct vnode *");
218 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
219 "char *");
220 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
221 "const char *");
222 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
223 "struct namecache *", "int", "int");
224 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
225 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
226 "char *", "struct vnode *");
227 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
228 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
229 "struct vnode *", "char *");
230 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
231 "struct vnode *");
232 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
233 "struct vnode *", "char *");
234 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
235 "char *");
236 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
237 "struct componentname *");
238 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
239 "struct componentname *");
240 SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t");
241 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
242 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
243 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
244 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
245 "struct vnode *");
246 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
247 "char *");
248 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
249 "char *");
250 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
251
252 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
253 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
254 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
255
256 /*
257 * This structure describes the elements in the cache of recent
258 * names looked up by namei.
259 */
260 struct negstate {
261 u_char neg_flag;
262 u_char neg_hit;
263 };
264 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
265 "the state must fit in a union with a pointer without growing it");
266
267 struct namecache {
268 LIST_ENTRY(namecache) nc_src; /* source vnode list */
269 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
270 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
271 struct vnode *nc_dvp; /* vnode of parent of name */
272 union {
273 struct vnode *nu_vp; /* vnode the name refers to */
274 struct negstate nu_neg;/* negative entry state */
275 } n_un;
276 u_char nc_flag; /* flag bits */
277 u_char nc_nlen; /* length of name */
278 char nc_name[0]; /* segment name + nul */
279 };
280
281 /*
282 * struct namecache_ts repeats struct namecache layout up to the
283 * nc_nlen member.
284 * struct namecache_ts is used in place of struct namecache when time(s) need
285 * to be stored. The nc_dotdottime field is used when a cache entry is mapping
286 * both a non-dotdot directory name plus dotdot for the directory's
287 * parent.
288 *
289 * See below for alignment requirement.
290 */
291 struct namecache_ts {
292 struct timespec nc_time; /* timespec provided by fs */
293 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */
294 int nc_ticks; /* ticks value when entry was added */
295 int nc_pad;
296 struct namecache nc_nc;
297 };
298
299 TAILQ_HEAD(cache_freebatch, namecache);
300
301 /*
302 * At least mips n32 performs 64-bit accesses to timespec as found
303  * in namecache_ts and requires them to be aligned. Since other platforms
304  * may be in the same spot, suffer a little bit and enforce the
305 * alignment for everyone. Note this is a nop for 64-bit platforms.
306 */
307 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t)
308
309 /*
310 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
311 * 4.4 BSD codebase. Later on struct namecache was tweaked to become
312 * smaller and the value was bumped to retain the total size, but it
313 * was never re-evaluated for suitability. A simple test counting
314 * lengths during package building shows that the value of 45 covers
315 * about 86% of all added entries, reaching 99% at 65.
316 *
317 * Regardless of the above, use of dedicated zones instead of malloc may be
318 * inducing additional waste. This may be hard to address as said zones are
319 * tied to VFS SMR. Even if retaining them, the current split should be
320 * re-evaluated.
321 */
322 #ifdef __LP64__
323 #define CACHE_PATH_CUTOFF 45
324 #define CACHE_LARGE_PAD 6
325 #else
326 #define CACHE_PATH_CUTOFF 41
327 #define CACHE_LARGE_PAD 2
328 #endif
329
330 #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
331 #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
332 #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
333 #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
334
335 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
336 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
337 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
338 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
339
340 #define nc_vp n_un.nu_vp
341 #define nc_neg n_un.nu_neg
342
343 /*
344 * Flags in namecache.nc_flag
345 */
346 #define NCF_WHITE 0x01
347 #define NCF_ISDOTDOT 0x02
348 #define NCF_TS 0x04
349 #define NCF_DTS 0x08
350 #define NCF_DVDROP 0x10
351 #define NCF_NEGATIVE 0x20
352 #define NCF_INVALID 0x40
353 #define NCF_WIP 0x80
354
355 /*
356 * Flags in negstate.neg_flag
357 */
358 #define NEG_HOT 0x01
359
360 static bool cache_neg_evict_cond(u_long lnumcache);
361
362 /*
363 * Mark an entry as invalid.
364 *
365 * This is called before it starts getting deconstructed.
366 */
367 static void
368 cache_ncp_invalidate(struct namecache *ncp)
369 {
370
371 KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
372 ("%s: entry %p already invalid", __func__, ncp));
373 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
374 atomic_thread_fence_rel();
375 }
376
377 /*
378 * Check whether the entry can be safely used.
379 *
380 * All places which elide locks are supposed to call this after they are
381 * done with reading from an entry.
382 */
383 #define cache_ncp_canuse(ncp) ({ \
384 struct namecache *_ncp = (ncp); \
385 u_char _nc_flag; \
386 \
387 atomic_thread_fence_acq(); \
388 _nc_flag = atomic_load_char(&_ncp->nc_flag); \
389 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \
390 })
391
392 /*
393 * Like the above but also checks NCF_WHITE.
394 */
395 #define cache_fpl_neg_ncp_canuse(ncp) ({ \
396 struct namecache *_ncp = (ncp); \
397 u_char _nc_flag; \
398 \
399 atomic_thread_fence_acq(); \
400 _nc_flag = atomic_load_char(&_ncp->nc_flag); \
401 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \
402 })
403
404 VFS_SMR_DECLARE;
405
406 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
407 "Name cache parameters");
408
409 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */
410 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
411 "Total namecache capacity");
412
413 u_int ncsizefactor = 2;
414 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
415 "Size factor for namecache");
416
417 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */
418 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
419 "Ratio of negative namecache entries");
420
421 /*
422 * Negative entry % of namecache capacity above which automatic eviction is allowed.
423 *
424 * Check cache_neg_evict_cond for details.
425 */
426 static u_int ncnegminpct = 3;
427
428 static u_int __read_mostly neg_min; /* the above recomputed against ncsize */
429 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
430 "Negative entry count above which automatic eviction is allowed");
431
432 /*
433 * Structures associated with name caching.
434 */
435 #define NCHHASH(hash) \
436 (&nchashtbl[(hash) & nchash])
437 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
438 static u_long __read_mostly nchash; /* size of hash table */
439 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
440 "Size of namecache hash table");
441 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */
442 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */
443
444 struct nchstats nchstats; /* cache effectiveness statistics */
445
446 static bool __read_frequently cache_fast_revlookup = true;
447 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
448 &cache_fast_revlookup, 0, "");
449
450 static bool __read_mostly cache_rename_add = true;
451 SYSCTL_BOOL(_vfs, OID_AUTO, cache_rename_add, CTLFLAG_RW,
452 &cache_rename_add, 0, "");
453
454 static u_int __exclusive_cache_line neg_cycle;
455
456 #define ncneghash 3
457 #define numneglists (ncneghash + 1)
458
459 struct neglist {
460 struct mtx nl_evict_lock;
461 struct mtx nl_lock __aligned(CACHE_LINE_SIZE);
462 TAILQ_HEAD(, namecache) nl_list;
463 TAILQ_HEAD(, namecache) nl_hotlist;
464 u_long nl_hotnum;
465 } __aligned(CACHE_LINE_SIZE);
466
467 static struct neglist neglists[numneglists];
468
469 static inline struct neglist *
470 NCP2NEGLIST(struct namecache *ncp)
471 {
472
473 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
474 }
475
476 static inline struct negstate *
477 NCP2NEGSTATE(struct namecache *ncp)
478 {
479
480 MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
481 return (&ncp->nc_neg);
482 }
483
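/*
 * Locks from the "Data organization" section above: hash chains are protected
 * by an array of bucket locks and the per-vnode namecache lists by an array of
 * vnode locks, selected by hashing the bucket hash or the vnode address
 * respectively.
 */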
484 #define numbucketlocks (ncbuckethash + 1)
485 static u_int __read_mostly ncbuckethash;
486 static struct mtx_padalign __read_mostly *bucketlocks;
487 #define HASH2BUCKETLOCK(hash) \
488 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
489
490 #define numvnodelocks (ncvnodehash + 1)
491 static u_int __read_mostly ncvnodehash;
492 static struct mtx __read_mostly *vnodelocks;
493 static inline struct mtx *
494 VP2VNODELOCK(struct vnode *vp)
495 {
496
497 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
498 }
499
500 static void
501 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
502 {
503 struct namecache_ts *ncp_ts;
504
505 KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
506 (tsp == NULL && ticksp == NULL),
507 ("No NCF_TS"));
508
509 if (tsp == NULL)
510 return;
511
512 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
513 *tsp = ncp_ts->nc_time;
514 *ticksp = ncp_ts->nc_ticks;
515 }
516
517 #ifdef DEBUG_CACHE
518 static int __read_mostly doingcache = 1; /* 1 => enable the cache */
519 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
520 "VFS namecache enabled");
521 #endif
522
523 /* Export size information to userland */
524 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
525 sizeof(struct namecache), "sizeof(struct namecache)");
526
527 /*
528 * The new name cache statistics
529 */
530 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
531 "Name cache statistics");
532
533 #define STATNODE_ULONG(name, varname, descr) \
534 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
535 #define STATNODE_COUNTER(name, varname, descr) \
536 static COUNTER_U64_DEFINE_EARLY(varname); \
537 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
538 descr);
539 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
540 STATNODE_ULONG(count, numcache, "Number of cache entries");
541 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
542 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
543 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
544 STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits");
545 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
546 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
547 STATNODE_COUNTER(posszaps, numposzaps,
548 "Number of cache hits (positive) we do not want to cache");
549 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
550 STATNODE_COUNTER(negzaps, numnegzaps,
551 "Number of cache hits (negative) we do not want to cache");
552 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
553 /* These count for vn_getcwd(), too. */
554 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
555 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
556 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
557 "Number of fullpath search errors (VOP_VPTOCNP failures)");
558 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
559 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
560 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");
561
562 /*
563 * Debug or developer statistics.
564 */
565 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
566 "Name cache debugging");
567 #define DEBUGNODE_ULONG(name, varname, descr) \
568 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
569 #define DEBUGNODE_COUNTER(name, varname, descr) \
570 static COUNTER_U64_DEFINE_EARLY(varname); \
571 SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
572 descr);
573 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
574 "Number of successful removals after relocking");
575 static long zap_bucket_fail;
576 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
577 static long zap_bucket_fail2;
578 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
579 static long cache_lock_vnodes_cel_3_failures;
580 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
581 "Number of times 3-way vnode locking failed");
582
583 static void cache_zap_locked(struct namecache *ncp);
584 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
585 char **retbuf, size_t *buflen, size_t addend);
586 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
587 char **retbuf, size_t *buflen);
588 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
589 char **retbuf, size_t *len, size_t addend);
590
591 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
592
593 static inline void
594 cache_assert_vlp_locked(struct mtx *vlp)
595 {
596
597 if (vlp != NULL)
598 mtx_assert(vlp, MA_OWNED);
599 }
600
601 static inline void
602 cache_assert_vnode_locked(struct vnode *vp)
603 {
604 struct mtx *vlp;
605
606 vlp = VP2VNODELOCK(vp);
607 cache_assert_vlp_locked(vlp);
608 }
609
610 /*
611 * Directory vnodes with entries are held for two reasons:
612 * 1. make them less of a target for reclamation in vnlru
613  * 2. suffer a smaller performance penalty in locked lookup as requeuing is avoided
614 *
615 * It will be feasible to stop doing it altogether if all filesystems start
616 * supporting lockless lookup.
617 */
618 static void
619 cache_hold_vnode(struct vnode *vp)
620 {
621
622 cache_assert_vnode_locked(vp);
623 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
624 vhold(vp);
625 counter_u64_add(numcachehv, 1);
626 }
627
628 static void
629 cache_drop_vnode(struct vnode *vp)
630 {
631
632 /*
633 * Called after all locks are dropped, meaning we can't assert
634 * on the state of v_cache_src.
635 */
636 vdrop(vp);
637 counter_u64_add(numcachehv, -1);
638 }
639
640 /*
641 * UMA zones.
642 */
643 static uma_zone_t __read_mostly cache_zone_small;
644 static uma_zone_t __read_mostly cache_zone_small_ts;
645 static uma_zone_t __read_mostly cache_zone_large;
646 static uma_zone_t __read_mostly cache_zone_large_ts;
647
648 char *
649 cache_symlink_alloc(size_t size, int flags)
650 {
651
652 if (size < CACHE_ZONE_SMALL_SIZE) {
653 return (uma_zalloc_smr(cache_zone_small, flags));
654 }
655 if (size < CACHE_ZONE_LARGE_SIZE) {
656 return (uma_zalloc_smr(cache_zone_large, flags));
657 }
658 counter_u64_add(symlinktoobig, 1);
659 SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
660 return (NULL);
661 }
662
663 void
664 cache_symlink_free(char *string, size_t size)
665 {
666
667 MPASS(string != NULL);
668 KASSERT(size < CACHE_ZONE_LARGE_SIZE,
669 ("%s: size %zu too big", __func__, size));
670
671 if (size < CACHE_ZONE_SMALL_SIZE) {
672 uma_zfree_smr(cache_zone_small, string);
673 return;
674 }
675 if (size < CACHE_ZONE_LARGE_SIZE) {
676 uma_zfree_smr(cache_zone_large, string);
677 return;
678 }
679 __assert_unreachable();
680 }
681
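/*
 * Allocate a namecache entry from the matching UMA zone: "small" or "large"
 * depending on whether the name fits within CACHE_PATH_CUTOFF, with the _ts
 * variants used when timestamps have to be stored alongside the entry.
 */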
682 static struct namecache *
683 cache_alloc_uma(int len, bool ts)
684 {
685 struct namecache_ts *ncp_ts;
686 struct namecache *ncp;
687
688 if (__predict_false(ts)) {
689 if (len <= CACHE_PATH_CUTOFF)
690 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
691 else
692 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
693 ncp = &ncp_ts->nc_nc;
694 } else {
695 if (len <= CACHE_PATH_CUTOFF)
696 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
697 else
698 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
699 }
700 return (ncp);
701 }
702
703 static void
704 cache_free_uma(struct namecache *ncp)
705 {
706 struct namecache_ts *ncp_ts;
707
708 if (__predict_false(ncp->nc_flag & NCF_TS)) {
709 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
710 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
711 uma_zfree_smr(cache_zone_small_ts, ncp_ts);
712 else
713 uma_zfree_smr(cache_zone_large_ts, ncp_ts);
714 } else {
715 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
716 uma_zfree_smr(cache_zone_small, ncp);
717 else
718 uma_zfree_smr(cache_zone_large, ncp);
719 }
720 }
721
722 static struct namecache *
723 cache_alloc(int len, bool ts)
724 {
725 u_long lnumcache;
726
727 /*
728 * Avoid blowout in namecache entries.
729 *
730 * Bugs:
731 * 1. filesystems may end up trying to add an already existing entry
732 * (for example this can happen after a cache miss during concurrent
733 * lookup), in which case we will call cache_neg_evict despite not
734 * adding anything.
735 * 2. the routine may fail to free anything and no provisions are made
736 * to make it try harder (see the inside for failure modes)
737 * 3. it only ever looks at negative entries.
738 */
739 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
740 if (cache_neg_evict_cond(lnumcache)) {
741 lnumcache = atomic_load_long(&numcache);
742 }
743 if (__predict_false(lnumcache >= ncsize)) {
744 atomic_subtract_long(&numcache, 1);
745 counter_u64_add(numdrops, 1);
746 return (NULL);
747 }
748 return (cache_alloc_uma(len, ts));
749 }
750
751 static void
752 cache_free(struct namecache *ncp)
753 {
754
755 MPASS(ncp != NULL);
756 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
757 cache_drop_vnode(ncp->nc_dvp);
758 }
759 cache_free_uma(ncp);
760 atomic_subtract_long(&numcache, 1);
761 }
762
763 static void
764 cache_free_batch(struct cache_freebatch *batch)
765 {
766 struct namecache *ncp, *nnp;
767 int i;
768
769 i = 0;
770 if (TAILQ_EMPTY(batch))
771 goto out;
772 TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
773 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
774 cache_drop_vnode(ncp->nc_dvp);
775 }
776 cache_free_uma(ncp);
777 i++;
778 }
779 atomic_subtract_long(&numcache, i);
780 out:
781 SDT_PROBE1(vfs, namecache, purge, batch, i);
782 }
783
784 /*
785 * Hashing.
786 *
787 * The code was made to use FNV in 2001 and this choice needs to be revisited.
788 *
789 * Short summary of the difficulty:
790 * The longest name which can be inserted is NAME_MAX characters in length (or
791  * 255 at the time of writing this comment), while the majority of names used in
792  * practice are significantly shorter (mostly below 10). More importantly,
793  * the majority of lookups performed find names even shorter than that.
794 *
795 * This poses a problem where hashes which do better than FNV past word size
796 * (or so) tend to come with additional overhead when finalizing the result,
797 * making them noticeably slower for the most commonly used range.
798 *
799 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
800 *
801 * When looking it up the most time consuming part by a large margin (at least
802 * on amd64) is hashing. Replacing FNV with something which pessimizes short
803 * input would make the slowest part stand out even more.
804 */
805
806 /*
807 * TODO: With the value stored we can do better than computing the hash based
808 * on the address.
809 */
810 static void
811 cache_prehash(struct vnode *vp)
812 {
813
814 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
815 }
816
817 static uint32_t
818 cache_get_hash(char *name, u_char len, struct vnode *dvp)
819 {
820
821 return (fnv_32_buf(name, len, dvp->v_nchash));
822 }
823
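/*
 * Incremental variant of cache_get_hash: start from the per-vnode seed, mix in
 * one character at a time and finalize. Producing the same result, this allows
 * callers to hash a name one character at a time while scanning it.
 */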
824 static uint32_t
825 cache_get_hash_iter_start(struct vnode *dvp)
826 {
827
828 return (dvp->v_nchash);
829 }
830
831 static uint32_t
832 cache_get_hash_iter(char c, uint32_t hash)
833 {
834
835 return (fnv_32_buf(&c, 1, hash));
836 }
837
838 static uint32_t
839 cache_get_hash_iter_finish(uint32_t hash)
840 {
841
842 return (hash);
843 }
844
845 static inline struct nchashhead *
846 NCP2BUCKET(struct namecache *ncp)
847 {
848 uint32_t hash;
849
850 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
851 return (NCHHASH(hash));
852 }
853
854 static inline struct mtx *
855 NCP2BUCKETLOCK(struct namecache *ncp)
856 {
857 uint32_t hash;
858
859 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
860 return (HASH2BUCKETLOCK(hash));
861 }
862
863 #ifdef INVARIANTS
864 static void
865 cache_assert_bucket_locked(struct namecache *ncp)
866 {
867 struct mtx *blp;
868
869 blp = NCP2BUCKETLOCK(ncp);
870 mtx_assert(blp, MA_OWNED);
871 }
872
873 static void
874 cache_assert_bucket_unlocked(struct namecache *ncp)
875 {
876 struct mtx *blp;
877
878 blp = NCP2BUCKETLOCK(ncp);
879 mtx_assert(blp, MA_NOTOWNED);
880 }
881 #else
882 #define cache_assert_bucket_locked(x) do { } while (0)
883 #define cache_assert_bucket_unlocked(x) do { } while (0)
884 #endif
885
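/*
 * Establish a fixed lock ordering: sort the two pointers by address so that
 * nested acquisitions always happen in the same order.
 */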
886 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
887 static void
888 _cache_sort_vnodes(void **p1, void **p2)
889 {
890 void *tmp;
891
892 MPASS(*p1 != NULL || *p2 != NULL);
893
894 if (*p1 > *p2) {
895 tmp = *p2;
896 *p2 = *p1;
897 *p1 = tmp;
898 }
899 }
900
901 static void
902 cache_lock_all_buckets(void)
903 {
904 u_int i;
905
906 for (i = 0; i < numbucketlocks; i++)
907 mtx_lock(&bucketlocks[i]);
908 }
909
910 static void
911 cache_unlock_all_buckets(void)
912 {
913 u_int i;
914
915 for (i = 0; i < numbucketlocks; i++)
916 mtx_unlock(&bucketlocks[i]);
917 }
918
919 static void
920 cache_lock_all_vnodes(void)
921 {
922 u_int i;
923
924 for (i = 0; i < numvnodelocks; i++)
925 mtx_lock(&vnodelocks[i]);
926 }
927
928 static void
929 cache_unlock_all_vnodes(void)
930 {
931 u_int i;
932
933 for (i = 0; i < numvnodelocks; i++)
934 mtx_unlock(&vnodelocks[i]);
935 }
936
937 static int
938 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
939 {
940
941 cache_sort_vnodes(&vlp1, &vlp2);
942
943 if (vlp1 != NULL) {
944 if (!mtx_trylock(vlp1))
945 return (EAGAIN);
946 }
947 if (!mtx_trylock(vlp2)) {
948 if (vlp1 != NULL)
949 mtx_unlock(vlp1);
950 return (EAGAIN);
951 }
952
953 return (0);
954 }
955
956 static void
957 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
958 {
959
960 MPASS(vlp1 != NULL || vlp2 != NULL);
961 MPASS(vlp1 <= vlp2);
962
963 if (vlp1 != NULL)
964 mtx_lock(vlp1);
965 if (vlp2 != NULL)
966 mtx_lock(vlp2);
967 }
968
969 static void
970 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
971 {
972
973 MPASS(vlp1 != NULL || vlp2 != NULL);
974
975 if (vlp1 != NULL)
976 mtx_unlock(vlp1);
977 if (vlp2 != NULL)
978 mtx_unlock(vlp2);
979 }
980
981 static int
982 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
983 {
984 struct nchstats snap;
985
986 if (req->oldptr == NULL)
987 return (SYSCTL_OUT(req, 0, sizeof(snap)));
988
989 snap = nchstats;
990 snap.ncs_goodhits = counter_u64_fetch(numposhits);
991 snap.ncs_neghits = counter_u64_fetch(numneghits);
992 snap.ncs_badhits = counter_u64_fetch(numposzaps) +
993 counter_u64_fetch(numnegzaps);
994 snap.ncs_miss = counter_u64_fetch(nummisszap) +
995 counter_u64_fetch(nummiss);
996
997 return (SYSCTL_OUT(req, &snap, sizeof(snap)));
998 }
999 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
1000 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
1001 "VFS cache effectiveness statistics");
1002
1003 static void
1004 cache_recalc_neg_min(u_int val)
1005 {
1006
1007 neg_min = (ncsize * val) / 100;
1008 }
1009
1010 static int
1011 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
1012 {
1013 u_int val;
1014 int error;
1015
1016 val = ncnegminpct;
1017 error = sysctl_handle_int(oidp, &val, 0, req);
1018 if (error != 0 || req->newptr == NULL)
1019 return (error);
1020
1021 if (val == ncnegminpct)
1022 return (0);
1023 if (val < 0 || val > 99)
1024 return (EINVAL);
1025 ncnegminpct = val;
1026 cache_recalc_neg_min(val);
1027 return (0);
1028 }
1029
1030 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
1031 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
1032 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
1033
1034 #ifdef DIAGNOSTIC
1035 /*
1036 * Grab an atomic snapshot of the name cache hash chain lengths
1037 */
1038 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
1039 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
1040 "hash table stats");
1041
1042 static int
1043 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
1044 {
1045 struct nchashhead *ncpp;
1046 struct namecache *ncp;
1047 int i, error, n_nchash, *cntbuf;
1048
1049 retry:
1050 n_nchash = nchash + 1; /* nchash is max index, not count */
1051 if (req->oldptr == NULL)
1052 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
1053 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
1054 cache_lock_all_buckets();
1055 if (n_nchash != nchash + 1) {
1056 cache_unlock_all_buckets();
1057 free(cntbuf, M_TEMP);
1058 goto retry;
1059 }
1060 /* Scan hash tables counting entries */
1061 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
1062 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
1063 cntbuf[i]++;
1064 cache_unlock_all_buckets();
1065 for (error = 0, i = 0; i < n_nchash; i++)
1066 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
1067 break;
1068 free(cntbuf, M_TEMP);
1069 return (error);
1070 }
1071 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
1072 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
1073 "nchash chain lengths");
1074
1075 static int
1076 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
1077 {
1078 int error;
1079 struct nchashhead *ncpp;
1080 struct namecache *ncp;
1081 int n_nchash;
1082 int count, maxlength, used, pct;
1083
1084 if (!req->oldptr)
1085 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
1086
1087 cache_lock_all_buckets();
1088 n_nchash = nchash + 1; /* nchash is max index, not count */
1089 used = 0;
1090 maxlength = 0;
1091
1092 /* Scan hash tables for applicable entries */
1093 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
1094 count = 0;
1095 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
1096 count++;
1097 }
1098 if (count)
1099 used++;
1100 if (maxlength < count)
1101 maxlength = count;
1102 }
1103 n_nchash = nchash + 1;
1104 cache_unlock_all_buckets();
1105 pct = (used * 100) / (n_nchash / 100);
1106 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
1107 if (error)
1108 return (error);
1109 error = SYSCTL_OUT(req, &used, sizeof(used));
1110 if (error)
1111 return (error);
1112 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
1113 if (error)
1114 return (error);
1115 error = SYSCTL_OUT(req, &pct, sizeof(pct));
1116 if (error)
1117 return (error);
1118 return (0);
1119 }
1120 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
1121 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
1122 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
1123 #endif
1124
1125 /*
1126 * Negative entries management
1127 *
1128 * Various workloads create plenty of negative entries and barely use them
1129  * afterwards. Moreover, malicious users can keep performing bogus lookups,
1130 * adding even more entries. For example "make tinderbox" as of writing this
1131 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
1132 * negative.
1133 *
1134 * As such, a rather aggressive eviction method is needed. The currently
1135 * employed method is a placeholder.
1136 *
1137 * Entries are split over numneglists separate lists, each of which is further
1138 * split into hot and cold entries. Entries get promoted after getting a hit.
1139  * Eviction happens on addition of a new entry.
1140 */
1141 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1142 "Name cache negative entry statistics");
1143
1144 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
1145 "Number of negative cache entries");
1146
1147 static COUNTER_U64_DEFINE_EARLY(neg_created);
1148 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
1149 "Number of created negative entries");
1150
1151 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
1152 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
1153 "Number of evicted negative entries");
1154
1155 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
1156 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
1157 &neg_evict_skipped_empty,
1158 "Number of times evicting failed due to lack of entries");
1159
1160 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
1161 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
1162 &neg_evict_skipped_missed,
1163 "Number of times evicting failed due to target entry disappearing");
1164
1165 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
1166 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
1167 &neg_evict_skipped_contended,
1168 "Number of times evicting failed due to contention");
1169
1170 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
1171 "Number of cache hits (negative)");
1172
1173 static int
1174 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1175 {
1176 int i, out;
1177
1178 out = 0;
1179 for (i = 0; i < numneglists; i++)
1180 out += neglists[i].nl_hotnum;
1181
1182 return (SYSCTL_OUT(req, &out, sizeof(out)));
1183 }
1184 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1185 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1186 "Number of hot negative entries");
1187
1188 static void
1189 cache_neg_init(struct namecache *ncp)
1190 {
1191 struct negstate *ns;
1192
1193 ncp->nc_flag |= NCF_NEGATIVE;
1194 ns = NCP2NEGSTATE(ncp);
1195 ns->neg_flag = 0;
1196 ns->neg_hit = 0;
1197 counter_u64_add(neg_created, 1);
1198 }
1199
1200 #define CACHE_NEG_PROMOTION_THRESH 2
1201
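/*
 * Count a hit on a negative entry. Returns true if the hit counter just
 * reached CACHE_NEG_PROMOTION_THRESH, signalling that the caller should
 * promote the entry to the hot list (see cache_neg_promote).
 */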
1202 static bool
1203 cache_neg_hit_prep(struct namecache *ncp)
1204 {
1205 struct negstate *ns;
1206 u_char n;
1207
1208 ns = NCP2NEGSTATE(ncp);
1209 n = atomic_load_char(&ns->neg_hit);
1210 for (;;) {
1211 if (n >= CACHE_NEG_PROMOTION_THRESH)
1212 return (false);
1213 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1214 break;
1215 }
1216 return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1217 }
1218
1219 /*
1220 * Nothing to do here but it is provided for completeness as some
1221 * cache_neg_hit_prep callers may end up returning without even
1222 * trying to promote.
1223 */
1224 #define cache_neg_hit_abort(ncp) do { } while (0)
1225
1226 static void
1227 cache_neg_hit_finish(struct namecache *ncp)
1228 {
1229
1230 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1231 counter_u64_add(numneghits, 1);
1232 }
1233
1234 /*
1235 * Move a negative entry to the hot list.
1236 */
1237 static void
1238 cache_neg_promote_locked(struct namecache *ncp)
1239 {
1240 struct neglist *nl;
1241 struct negstate *ns;
1242
1243 ns = NCP2NEGSTATE(ncp);
1244 nl = NCP2NEGLIST(ncp);
1245 mtx_assert(&nl->nl_lock, MA_OWNED);
1246 if ((ns->neg_flag & NEG_HOT) == 0) {
1247 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1248 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1249 nl->nl_hotnum++;
1250 ns->neg_flag |= NEG_HOT;
1251 }
1252 }
1253
1254 /*
1255 * Move a hot negative entry to the cold list.
1256 */
1257 static void
1258 cache_neg_demote_locked(struct namecache *ncp)
1259 {
1260 struct neglist *nl;
1261 struct negstate *ns;
1262
1263 ns = NCP2NEGSTATE(ncp);
1264 nl = NCP2NEGLIST(ncp);
1265 mtx_assert(&nl->nl_lock, MA_OWNED);
1266 MPASS(ns->neg_flag & NEG_HOT);
1267 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1268 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1269 nl->nl_hotnum--;
1270 ns->neg_flag &= ~NEG_HOT;
1271 atomic_store_char(&ns->neg_hit, 0);
1272 }
1273
1274 /*
1275 * Move a negative entry to the hot list if it matches the lookup.
1276 *
1277 * We have to take locks, but they may be contended and in the worst
1278 * case we may need to go off CPU. We don't want to spin within the
1279 * smr section and we can't block with it. Exiting the section means
1280 * the found entry could have been evicted. We are going to look it
1281 * up again.
1282 */
1283 static bool
1284 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1285 struct namecache *oncp, uint32_t hash)
1286 {
1287 struct namecache *ncp;
1288 struct neglist *nl;
1289 u_char nc_flag;
1290
1291 nl = NCP2NEGLIST(oncp);
1292
1293 mtx_lock(&nl->nl_lock);
1294 /*
1295 * For hash iteration.
1296 */
1297 vfs_smr_enter();
1298
1299 /*
1300 * Avoid all surprises by only succeeding if we got the same entry and
1301 * bailing completely otherwise.
1302 * XXX There are no provisions to keep the vnode around, meaning we may
1303 * end up promoting a negative entry for a *new* vnode and returning
1304 * ENOENT on its account. This is the error we want to return anyway
1305 * and promotion is harmless.
1306 *
1307 * In particular at this point there can be a new ncp which matches the
1308 * search but hashes to a different neglist.
1309 */
1310 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1311 if (ncp == oncp)
1312 break;
1313 }
1314
1315 /*
1316 * No match to begin with.
1317 */
1318 if (__predict_false(ncp == NULL)) {
1319 goto out_abort;
1320 }
1321
1322 /*
1323 * The newly found entry may be something different...
1324 */
1325 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1326 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
1327 goto out_abort;
1328 }
1329
1330 /*
1331 * ... and not even negative.
1332 */
1333 nc_flag = atomic_load_char(&ncp->nc_flag);
1334 if ((nc_flag & NCF_NEGATIVE) == 0) {
1335 goto out_abort;
1336 }
1337
1338 if (!cache_ncp_canuse(ncp)) {
1339 goto out_abort;
1340 }
1341
1342 cache_neg_promote_locked(ncp);
1343 cache_neg_hit_finish(ncp);
1344 vfs_smr_exit();
1345 mtx_unlock(&nl->nl_lock);
1346 return (true);
1347 out_abort:
1348 vfs_smr_exit();
1349 mtx_unlock(&nl->nl_lock);
1350 return (false);
1351 }
1352
1353 static void
1354 cache_neg_promote(struct namecache *ncp)
1355 {
1356 struct neglist *nl;
1357
1358 nl = NCP2NEGLIST(ncp);
1359 mtx_lock(&nl->nl_lock);
1360 cache_neg_promote_locked(ncp);
1361 mtx_unlock(&nl->nl_lock);
1362 }
1363
1364 static void
1365 cache_neg_insert(struct namecache *ncp)
1366 {
1367 struct neglist *nl;
1368
1369 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1370 cache_assert_bucket_locked(ncp);
1371 nl = NCP2NEGLIST(ncp);
1372 mtx_lock(&nl->nl_lock);
1373 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1374 mtx_unlock(&nl->nl_lock);
1375 atomic_add_long(&numneg, 1);
1376 }
1377
1378 static void
1379 cache_neg_remove(struct namecache *ncp)
1380 {
1381 struct neglist *nl;
1382 struct negstate *ns;
1383
1384 cache_assert_bucket_locked(ncp);
1385 nl = NCP2NEGLIST(ncp);
1386 ns = NCP2NEGSTATE(ncp);
1387 mtx_lock(&nl->nl_lock);
1388 if ((ns->neg_flag & NEG_HOT) != 0) {
1389 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1390 nl->nl_hotnum--;
1391 } else {
1392 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1393 }
1394 mtx_unlock(&nl->nl_lock);
1395 atomic_subtract_long(&numneg, 1);
1396 }
1397
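/*
 * Pick the next list to evict from in a round-robin fashion (driven by the
 * neg_cycle counter), backing off if its eviction lock is contended.
 */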
1398 static struct neglist *
1399 cache_neg_evict_select_list(void)
1400 {
1401 struct neglist *nl;
1402 u_int c;
1403
1404 c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1405 nl = &neglists[c % numneglists];
1406 if (!mtx_trylock(&nl->nl_evict_lock)) {
1407 counter_u64_add(neg_evict_skipped_contended, 1);
1408 return (NULL);
1409 }
1410 return (nl);
1411 }
1412
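/*
 * Scan the first few entries on the cold list and pick the one with the
 * lowest hit count as the eviction candidate.
 */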
1413 static struct namecache *
1414 cache_neg_evict_select_entry(struct neglist *nl)
1415 {
1416 struct namecache *ncp, *lncp;
1417 struct negstate *ns, *lns;
1418 int i;
1419
1420 mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1421 mtx_assert(&nl->nl_lock, MA_OWNED);
1422 ncp = TAILQ_FIRST(&nl->nl_list);
1423 if (ncp == NULL)
1424 return (NULL);
1425 lncp = ncp;
1426 lns = NCP2NEGSTATE(lncp);
1427 for (i = 1; i < 4; i++) {
1428 ncp = TAILQ_NEXT(ncp, nc_dst);
1429 if (ncp == NULL)
1430 break;
1431 ns = NCP2NEGSTATE(ncp);
1432 if (ns->neg_hit < lns->neg_hit) {
1433 lncp = ncp;
1434 lns = ns;
1435 }
1436 }
1437 return (lncp);
1438 }
1439
1440 static bool
1441 cache_neg_evict(void)
1442 {
1443 struct namecache *ncp, *ncp2;
1444 struct neglist *nl;
1445 struct vnode *dvp;
1446 struct mtx *dvlp;
1447 struct mtx *blp;
1448 uint32_t hash;
1449 u_char nlen;
1450 bool evicted;
1451
1452 nl = cache_neg_evict_select_list();
1453 if (nl == NULL) {
1454 return (false);
1455 }
1456
1457 mtx_lock(&nl->nl_lock);
1458 ncp = TAILQ_FIRST(&nl->nl_hotlist);
1459 if (ncp != NULL) {
1460 cache_neg_demote_locked(ncp);
1461 }
1462 ncp = cache_neg_evict_select_entry(nl);
1463 if (ncp == NULL) {
1464 counter_u64_add(neg_evict_skipped_empty, 1);
1465 mtx_unlock(&nl->nl_lock);
1466 mtx_unlock(&nl->nl_evict_lock);
1467 return (false);
1468 }
1469 nlen = ncp->nc_nlen;
1470 dvp = ncp->nc_dvp;
1471 hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1472 dvlp = VP2VNODELOCK(dvp);
1473 blp = HASH2BUCKETLOCK(hash);
1474 mtx_unlock(&nl->nl_lock);
1475 mtx_unlock(&nl->nl_evict_lock);
1476 mtx_lock(dvlp);
1477 mtx_lock(blp);
1478 /*
1479 * Note that since all locks were dropped above, the entry may be
1480 * gone or reallocated to be something else.
1481 */
1482 CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1483 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1484 ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1485 break;
1486 }
1487 if (ncp2 == NULL) {
1488 counter_u64_add(neg_evict_skipped_missed, 1);
1489 ncp = NULL;
1490 evicted = false;
1491 } else {
1492 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1493 MPASS(blp == NCP2BUCKETLOCK(ncp));
1494 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1495 ncp->nc_name);
1496 cache_zap_locked(ncp);
1497 counter_u64_add(neg_evicted, 1);
1498 evicted = true;
1499 }
1500 mtx_unlock(blp);
1501 mtx_unlock(dvlp);
1502 if (ncp != NULL)
1503 cache_free(ncp);
1504 return (evicted);
1505 }
1506
1507 /*
1508 * Maybe evict a negative entry to create more room.
1509 *
1510 * The ncnegfactor parameter limits what fraction of the total count
1511  * can consist of negative entries. However, if the cache is just
1512 * warming up this leads to excessive evictions. As such, ncnegminpct
1513 * (recomputed to neg_min) dictates whether the above should be
1514 * applied.
1515 *
1516 * Try evicting if the cache is close to full capacity regardless of
1517 * other considerations.
1518 */
1519 static bool
1520 cache_neg_evict_cond(u_long lnumcache)
1521 {
1522 u_long lnumneg;
1523
1524 if (ncsize - 1000 < lnumcache)
1525 goto out_evict;
1526 lnumneg = atomic_load_long(&numneg);
1527 if (lnumneg < neg_min)
1528 return (false);
1529 if (lnumneg * ncnegfactor < lnumcache)
1530 return (false);
1531 out_evict:
1532 return (cache_neg_evict());
1533 }
1534
1535 /*
1536 * cache_zap_locked():
1537 *
1538 * Removes a namecache entry from cache, whether it contains an actual
1539 * pointer to a vnode or if it is just a negative cache entry.
1540 */
1541 static void
1542 cache_zap_locked(struct namecache *ncp)
1543 {
1544 struct nchashhead *ncpp;
1545 struct vnode *dvp, *vp;
1546
1547 dvp = ncp->nc_dvp;
1548 vp = ncp->nc_vp;
1549
1550 if (!(ncp->nc_flag & NCF_NEGATIVE))
1551 cache_assert_vnode_locked(vp);
1552 cache_assert_vnode_locked(dvp);
1553 cache_assert_bucket_locked(ncp);
1554
1555 cache_ncp_invalidate(ncp);
1556
1557 ncpp = NCP2BUCKET(ncp);
1558 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1559 if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1560 SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
1561 TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
1562 if (ncp == vp->v_cache_dd) {
1563 atomic_store_ptr(&vp->v_cache_dd, NULL);
1564 }
1565 } else {
1566 SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
1567 cache_neg_remove(ncp);
1568 }
1569 if (ncp->nc_flag & NCF_ISDOTDOT) {
1570 if (ncp == dvp->v_cache_dd) {
1571 atomic_store_ptr(&dvp->v_cache_dd, NULL);
1572 }
1573 } else {
1574 LIST_REMOVE(ncp, nc_src);
1575 if (LIST_EMPTY(&dvp->v_cache_src)) {
1576 ncp->nc_flag |= NCF_DVDROP;
1577 }
1578 }
1579 }
1580
1581 static void
1582 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1583 {
1584 struct mtx *blp;
1585
1586 MPASS(ncp->nc_dvp == vp);
1587 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1588 cache_assert_vnode_locked(vp);
1589
1590 blp = NCP2BUCKETLOCK(ncp);
1591 mtx_lock(blp);
1592 cache_zap_locked(ncp);
1593 mtx_unlock(blp);
1594 }
1595
1596 static bool
1597 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1598 struct mtx **vlpp)
1599 {
1600 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1601 struct mtx *blp;
1602
1603 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1604 cache_assert_vnode_locked(vp);
1605
1606 if (ncp->nc_flag & NCF_NEGATIVE) {
1607 if (*vlpp != NULL) {
1608 mtx_unlock(*vlpp);
1609 *vlpp = NULL;
1610 }
1611 cache_zap_negative_locked_vnode_kl(ncp, vp);
1612 return (true);
1613 }
1614
1615 pvlp = VP2VNODELOCK(vp);
1616 blp = NCP2BUCKETLOCK(ncp);
1617 vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1618 vlp2 = VP2VNODELOCK(ncp->nc_vp);
1619
1620 if (*vlpp == vlp1 || *vlpp == vlp2) {
1621 to_unlock = *vlpp;
1622 *vlpp = NULL;
1623 } else {
1624 if (*vlpp != NULL) {
1625 mtx_unlock(*vlpp);
1626 *vlpp = NULL;
1627 }
1628 cache_sort_vnodes(&vlp1, &vlp2);
1629 if (vlp1 == pvlp) {
1630 mtx_lock(vlp2);
1631 to_unlock = vlp2;
1632 } else {
1633 if (!mtx_trylock(vlp1))
1634 goto out_relock;
1635 to_unlock = vlp1;
1636 }
1637 }
1638 mtx_lock(blp);
1639 cache_zap_locked(ncp);
1640 mtx_unlock(blp);
1641 if (to_unlock != NULL)
1642 mtx_unlock(to_unlock);
1643 return (true);
1644
1645 out_relock:
1646 mtx_unlock(vlp2);
1647 mtx_lock(vlp1);
1648 mtx_lock(vlp2);
1649 MPASS(*vlpp == NULL);
1650 *vlpp = vlp1;
1651 return (false);
1652 }
1653
1654 /*
1655 * If trylocking failed we can get here. We know enough to take all needed locks
1656 * in the right order and re-lookup the entry.
1657 */
1658 static int
1659 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1660 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1661 struct mtx *blp)
1662 {
1663 struct namecache *rncp;
1664
1665 cache_assert_bucket_unlocked(ncp);
1666
1667 cache_sort_vnodes(&dvlp, &vlp);
1668 cache_lock_vnodes(dvlp, vlp);
1669 mtx_lock(blp);
1670 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1671 if (rncp == ncp && rncp->nc_dvp == dvp &&
1672 rncp->nc_nlen == cnp->cn_namelen &&
1673 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1674 break;
1675 }
1676 if (rncp != NULL) {
1677 cache_zap_locked(rncp);
1678 mtx_unlock(blp);
1679 cache_unlock_vnodes(dvlp, vlp);
1680 counter_u64_add(zap_bucket_relock_success, 1);
1681 return (0);
1682 }
1683
1684 mtx_unlock(blp);
1685 cache_unlock_vnodes(dvlp, vlp);
1686 return (EAGAIN);
1687 }
1688
1689 static int __noinline
1690 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1691 uint32_t hash, struct mtx *blp)
1692 {
1693 struct mtx *dvlp, *vlp;
1694 struct vnode *dvp;
1695
1696 cache_assert_bucket_locked(ncp);
1697
1698 dvlp = VP2VNODELOCK(ncp->nc_dvp);
1699 vlp = NULL;
1700 if (!(ncp->nc_flag & NCF_NEGATIVE))
1701 vlp = VP2VNODELOCK(ncp->nc_vp);
1702 if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1703 cache_zap_locked(ncp);
1704 mtx_unlock(blp);
1705 cache_unlock_vnodes(dvlp, vlp);
1706 return (0);
1707 }
1708
1709 dvp = ncp->nc_dvp;
1710 mtx_unlock(blp);
1711 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1712 }
1713
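/*
 * Remove the entry for the given (directory vnode; name) pair, if any.
 *
 * Returns 1 if an entry was found and zapped, 0 otherwise. The ".." case is
 * handled separately through the v_cache_dd pointer.
 */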
1714 static __noinline int
1715 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1716 {
1717 struct namecache *ncp;
1718 struct mtx *blp;
1719 struct mtx *dvlp, *dvlp2;
1720 uint32_t hash;
1721 int error;
1722
1723 if (cnp->cn_namelen == 2 &&
1724 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1725 dvlp = VP2VNODELOCK(dvp);
1726 dvlp2 = NULL;
1727 mtx_lock(dvlp);
1728 retry_dotdot:
1729 ncp = dvp->v_cache_dd;
1730 if (ncp == NULL) {
1731 mtx_unlock(dvlp);
1732 if (dvlp2 != NULL)
1733 mtx_unlock(dvlp2);
1734 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1735 return (0);
1736 }
1737 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1738 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1739 goto retry_dotdot;
1740 MPASS(dvp->v_cache_dd == NULL);
1741 mtx_unlock(dvlp);
1742 if (dvlp2 != NULL)
1743 mtx_unlock(dvlp2);
1744 cache_free(ncp);
1745 } else {
1746 atomic_store_ptr(&dvp->v_cache_dd, NULL);
1747 mtx_unlock(dvlp);
1748 if (dvlp2 != NULL)
1749 mtx_unlock(dvlp2);
1750 }
1751 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1752 return (1);
1753 }
1754
1755 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1756 blp = HASH2BUCKETLOCK(hash);
1757 retry:
1758 if (CK_SLIST_EMPTY(NCHHASH(hash)))
1759 goto out_no_entry;
1760
1761 mtx_lock(blp);
1762
1763 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1764 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1765 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1766 break;
1767 }
1768
1769 if (ncp == NULL) {
1770 mtx_unlock(blp);
1771 goto out_no_entry;
1772 }
1773
1774 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1775 if (__predict_false(error != 0)) {
1776 zap_bucket_fail++;
1777 goto retry;
1778 }
1779 counter_u64_add(numposzaps, 1);
1780 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1781 cache_free(ncp);
1782 return (1);
1783 out_no_entry:
1784 counter_u64_add(nummisszap, 1);
1785 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1786 return (0);
1787 }
1788
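/*
 * Handle a lookup of ".": return dvp itself with an extra reference,
 * re-locking it if the caller asked for a different lock type.
 */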
1789 static int __noinline
1790 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1791 struct timespec *tsp, int *ticksp)
1792 {
1793 int ltype;
1794
1795 *vpp = dvp;
1796 counter_u64_add(dothits, 1);
1797 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1798 if (tsp != NULL)
1799 timespecclear(tsp);
1800 if (ticksp != NULL)
1801 *ticksp = ticks;
1802 vrefact(*vpp);
1803 /*
1804 * When we look up "." we can still be asked to lock it
1805 * differently...
1806 */
1807 ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1808 if (ltype != VOP_ISLOCKED(*vpp)) {
1809 if (ltype == LK_EXCLUSIVE) {
1810 vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1811 if (VN_IS_DOOMED((*vpp))) {
1812 /* forced unmount */
1813 vrele(*vpp);
1814 *vpp = NULL;
1815 return (ENOENT);
1816 }
1817 } else
1818 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1819 }
1820 return (-1);
1821 }
1822
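/*
 * Handle a lookup of "..": such entries hang off dvp->v_cache_dd rather than
 * the hash table and are guarded by the namecache lock for dvp.
 */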
1823 static int __noinline
1824 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1825 struct timespec *tsp, int *ticksp)
1826 {
1827 struct namecache_ts *ncp_ts;
1828 struct namecache *ncp;
1829 struct mtx *dvlp;
1830 enum vgetstate vs;
1831 int error, ltype;
1832 bool whiteout;
1833
1834 MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1835
1836 if ((cnp->cn_flags & MAKEENTRY) == 0) {
1837 cache_remove_cnp(dvp, cnp);
1838 return (0);
1839 }
1840
1841 counter_u64_add(dotdothits, 1);
1842 retry:
1843 dvlp = VP2VNODELOCK(dvp);
1844 mtx_lock(dvlp);
1845 ncp = dvp->v_cache_dd;
1846 if (ncp == NULL) {
1847 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
1848 mtx_unlock(dvlp);
1849 return (0);
1850 }
1851 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1852 if (ncp->nc_flag & NCF_NEGATIVE)
1853 *vpp = NULL;
1854 else
1855 *vpp = ncp->nc_vp;
1856 } else
1857 *vpp = ncp->nc_dvp;
1858 if (*vpp == NULL)
1859 goto negative_success;
1860 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1861 cache_out_ts(ncp, tsp, ticksp);
1862 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1863 NCF_DTS && tsp != NULL) {
1864 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1865 *tsp = ncp_ts->nc_dotdottime;
1866 }
1867
1868 MPASS(dvp != *vpp);
1869 ltype = VOP_ISLOCKED(dvp);
1870 VOP_UNLOCK(dvp);
1871 vs = vget_prep(*vpp);
1872 mtx_unlock(dvlp);
1873 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1874 vn_lock(dvp, ltype | LK_RETRY);
1875 if (VN_IS_DOOMED(dvp)) {
1876 if (error == 0)
1877 vput(*vpp);
1878 *vpp = NULL;
1879 return (ENOENT);
1880 }
1881 if (error) {
1882 *vpp = NULL;
1883 goto retry;
1884 }
1885 return (-1);
1886 negative_success:
1887 if (__predict_false(cnp->cn_nameiop == CREATE)) {
1888 if (cnp->cn_flags & ISLASTCN) {
1889 counter_u64_add(numnegzaps, 1);
1890 cache_zap_negative_locked_vnode_kl(ncp, dvp);
1891 mtx_unlock(dvlp);
1892 cache_free(ncp);
1893 return (0);
1894 }
1895 }
1896
1897 whiteout = (ncp->nc_flag & NCF_WHITE);
1898 cache_out_ts(ncp, tsp, ticksp);
1899 if (cache_neg_hit_prep(ncp))
1900 cache_neg_promote(ncp);
1901 else
1902 cache_neg_hit_finish(ncp);
1903 mtx_unlock(dvlp);
1904 if (whiteout)
1905 cnp->cn_flags |= ISWHITEOUT;
1906 return (ENOENT);
1907 }
1908
1909 /**
1910 * Lookup a name in the name cache
1911 *
1912 * # Arguments
1913 *
1914 * - dvp: Parent directory in which to search.
1915 * - vpp: Return argument. Will contain desired vnode on cache hit.
1916 * - cnp: Parameters of the name search. The most interesting bits of
1917 * the cn_flags field have the following meanings:
1918 * - MAKEENTRY: If clear, free an entry from the cache rather than look
1919 * it up.
1920 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".."
1921 * - tsp: Return storage for cache timestamp. On a successful (positive
1922 * or negative) lookup, tsp will be filled with any timespec that
1923 * was stored when this cache entry was created. However, it will
1924 * be clear for "." entries.
1925 * - ticksp: Return storage for alternate cache timestamp. On a successful
1926 * (positive or negative) lookup, it will contain the ticks value
1927 * that was current when the cache entry was created, unless cnp
1928 * was ".".
1929 *
1930 * Either both tsp and ticksp have to be provided, or neither of them.
1931 *
1932 * # Returns
1933 *
1934 * - -1: A positive cache hit. vpp will contain the desired vnode.
1935 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due
1936 * to a forced unmount. vpp will not be modified. If the entry
1937 * is a whiteout, then the ISWHITEOUT flag will be set in
1938 * cnp->cn_flags.
1939 * - 0: A cache miss. vpp will not be modified.
1940 *
1941 * # Locking
1942 *
1943 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up
1944 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the
1945 * lock is not recursively acquired.
1946 */
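/*
 * Locked variant of the lookup, used when the lockless (SMR) attempt in
 * cache_lookup() below cannot safely complete.
 */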
1947 static int __noinline
1948 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1949 struct timespec *tsp, int *ticksp)
1950 {
1951 struct namecache *ncp;
1952 struct mtx *blp;
1953 uint32_t hash;
1954 enum vgetstate vs;
1955 int error;
1956 bool whiteout;
1957
1958 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1959 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
1960
1961 retry:
1962 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1963 blp = HASH2BUCKETLOCK(hash);
1964 mtx_lock(blp);
1965
1966 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1967 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1968 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1969 break;
1970 }
1971
1972 if (__predict_false(ncp == NULL)) {
1973 mtx_unlock(blp);
1974 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
1975 counter_u64_add(nummiss, 1);
1976 return (0);
1977 }
1978
1979 if (ncp->nc_flag & NCF_NEGATIVE)
1980 goto negative_success;
1981
1982 counter_u64_add(numposhits, 1);
1983 *vpp = ncp->nc_vp;
1984 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1985 cache_out_ts(ncp, tsp, ticksp);
1986 MPASS(dvp != *vpp);
1987 vs = vget_prep(*vpp);
1988 mtx_unlock(blp);
1989 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1990 if (error) {
1991 *vpp = NULL;
1992 goto retry;
1993 }
1994 return (-1);
1995 negative_success:
1996 /*
1997 * We don't get here with regular lookup apart from corner cases.
1998 */
1999 if (__predict_true(cnp->cn_nameiop == CREATE)) {
2000 if (cnp->cn_flags & ISLASTCN) {
2001 counter_u64_add(numnegzaps, 1);
2002 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
2003 if (__predict_false(error != 0)) {
2004 zap_bucket_fail2++;
2005 goto retry;
2006 }
2007 cache_free(ncp);
2008 return (0);
2009 }
2010 }
2011
2012 whiteout = (ncp->nc_flag & NCF_WHITE);
2013 cache_out_ts(ncp, tsp, ticksp);
2014 if (cache_neg_hit_prep(ncp))
2015 cache_neg_promote(ncp);
2016 else
2017 cache_neg_hit_finish(ncp);
2018 mtx_unlock(blp);
2019 if (whiteout)
2020 cnp->cn_flags |= ISWHITEOUT;
2021 return (ENOENT);
2022 }
2023
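/*
 * Dispatch "." and ".." to their dedicated handlers, then attempt a lockless
 * lookup under SMR protection. Whenever the result cannot be safely validated
 * the work is redone by cache_lookup_fallback() with locks held.
 */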
2024 int
2025 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2026 struct timespec *tsp, int *ticksp)
2027 {
2028 struct namecache *ncp;
2029 uint32_t hash;
2030 enum vgetstate vs;
2031 int error;
2032 bool whiteout, neg_promote;
2033 u_short nc_flag;
2034
2035 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
2036
2037 #ifdef DEBUG_CACHE
2038 if (__predict_false(!doingcache)) {
2039 cnp->cn_flags &= ~MAKEENTRY;
2040 return (0);
2041 }
2042 #endif
2043
2044 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2045 if (cnp->cn_namelen == 1)
2046 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
2047 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
2048 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
2049 }
2050
2051 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2052
2053 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
2054 cache_remove_cnp(dvp, cnp);
2055 return (0);
2056 }
2057
2058 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2059 vfs_smr_enter();
2060
2061 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2062 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2063 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
2064 break;
2065 }
2066
2067 if (__predict_false(ncp == NULL)) {
2068 vfs_smr_exit();
2069 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2070 counter_u64_add(nummiss, 1);
2071 return (0);
2072 }
2073
2074 nc_flag = atomic_load_char(&ncp->nc_flag);
2075 if (nc_flag & NCF_NEGATIVE)
2076 goto negative_success;
2077
2078 counter_u64_add(numposhits, 1);
2079 *vpp = ncp->nc_vp;
2080 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2081 cache_out_ts(ncp, tsp, ticksp);
2082 MPASS(dvp != *vpp);
2083 if (!cache_ncp_canuse(ncp)) {
2084 vfs_smr_exit();
2085 *vpp = NULL;
2086 goto out_fallback;
2087 }
2088 vs = vget_prep_smr(*vpp);
2089 vfs_smr_exit();
2090 if (__predict_false(vs == VGET_NONE)) {
2091 *vpp = NULL;
2092 goto out_fallback;
2093 }
2094 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2095 if (error) {
2096 *vpp = NULL;
2097 goto out_fallback;
2098 }
2099 return (-1);
2100 negative_success:
2101 if (cnp->cn_nameiop == CREATE) {
2102 if (cnp->cn_flags & ISLASTCN) {
2103 vfs_smr_exit();
2104 goto out_fallback;
2105 }
2106 }
2107
2108 cache_out_ts(ncp, tsp, ticksp);
2109 whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
2110 neg_promote = cache_neg_hit_prep(ncp);
2111 if (!cache_ncp_canuse(ncp)) {
2112 cache_neg_hit_abort(ncp);
2113 vfs_smr_exit();
2114 goto out_fallback;
2115 }
2116 if (neg_promote) {
2117 vfs_smr_exit();
2118 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
2119 goto out_fallback;
2120 } else {
2121 cache_neg_hit_finish(ncp);
2122 vfs_smr_exit();
2123 }
2124 if (whiteout)
2125 cnp->cn_flags |= ISWHITEOUT;
2126 return (ENOENT);
2127 out_fallback:
2128 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
2129 }
2130
2131 struct celockstate {
2132 struct mtx *vlp[3];
2133 struct mtx *blp[2];
2134 };
2135 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
2136 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
2137
2138 static inline void
2139 cache_celockstate_init(struct celockstate *cel)
2140 {
2141
2142 bzero(cel, sizeof(*cel));
2143 }
2144
2145 static void
2146 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
2147 struct vnode *dvp)
2148 {
2149 struct mtx *vlp1, *vlp2;
2150
2151 MPASS(cel->vlp[0] == NULL);
2152 MPASS(cel->vlp[1] == NULL);
2153 MPASS(cel->vlp[2] == NULL);
2154
2155 MPASS(vp != NULL || dvp != NULL);
2156
2157 vlp1 = VP2VNODELOCK(vp);
2158 vlp2 = VP2VNODELOCK(dvp);
2159 cache_sort_vnodes(&vlp1, &vlp2);
2160
2161 if (vlp1 != NULL) {
2162 mtx_lock(vlp1);
2163 cel->vlp[0] = vlp1;
2164 }
2165 mtx_lock(vlp2);
2166 cel->vlp[1] = vlp2;
2167 }
2168
2169 static void
2170 cache_unlock_vnodes_cel(struct celockstate *cel)
2171 {
2172
2173 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2174
2175 if (cel->vlp[0] != NULL)
2176 mtx_unlock(cel->vlp[0]);
2177 if (cel->vlp[1] != NULL)
2178 mtx_unlock(cel->vlp[1]);
2179 if (cel->vlp[2] != NULL)
2180 mtx_unlock(cel->vlp[2]);
2181 }
2182
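/*
 * Lock the namecache vnode lock of a third vnode while preserving the
 * address-based lock order. Returns false if the already held locks had to be
 * dropped and reacquired, in which case the caller must re-validate its state.
 */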
2183 static bool
2184 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2185 {
2186 struct mtx *vlp;
2187 bool ret;
2188
2189 cache_assert_vlp_locked(cel->vlp[0]);
2190 cache_assert_vlp_locked(cel->vlp[1]);
2191 MPASS(cel->vlp[2] == NULL);
2192
2193 MPASS(vp != NULL);
2194 vlp = VP2VNODELOCK(vp);
2195
2196 ret = true;
2197 if (vlp >= cel->vlp[1]) {
2198 mtx_lock(vlp);
2199 } else {
2200 if (mtx_trylock(vlp))
2201 goto out;
2202 cache_lock_vnodes_cel_3_failures++;
2203 cache_unlock_vnodes_cel(cel);
2204 if (vlp < cel->vlp[0]) {
2205 mtx_lock(vlp);
2206 mtx_lock(cel->vlp[0]);
2207 mtx_lock(cel->vlp[1]);
2208 } else {
2209 if (cel->vlp[0] != NULL)
2210 mtx_lock(cel->vlp[0]);
2211 mtx_lock(vlp);
2212 mtx_lock(cel->vlp[1]);
2213 }
2214 ret = false;
2215 }
2216 out:
2217 cel->vlp[2] = vlp;
2218 return (ret);
2219 }
2220
2221 static void
2222 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2223 struct mtx *blp2)
2224 {
2225
2226 MPASS(cel->blp[0] == NULL);
2227 MPASS(cel->blp[1] == NULL);
2228
2229 cache_sort_vnodes(&blp1, &blp2);
2230
2231 if (blp1 != NULL) {
2232 mtx_lock(blp1);
2233 cel->blp[0] = blp1;
2234 }
2235 mtx_lock(blp2);
2236 cel->blp[1] = blp2;
2237 }
2238
2239 static void
2240 cache_unlock_buckets_cel(struct celockstate *cel)
2241 {
2242
2243 if (cel->blp[0] != NULL)
2244 mtx_unlock(cel->blp[0]);
2245 mtx_unlock(cel->blp[1]);
2246 }
2247
2248 /*
2249 * Lock part of the cache affected by the insertion.
2250 *
2251 * This means vnodelocks for dvp, vp and the relevant bucketlock.
2252 * However, insertion can result in removal of an old entry. In this
2253 * case we have an additional vnode and bucketlock pair to lock.
2254 *
2255 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2256 * preserving the locking order (smaller address first).
2257 */
2258 static void
2259 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2260 uint32_t hash)
2261 {
2262 struct namecache *ncp;
2263 struct mtx *blps[2];
2264 u_char nc_flag;
2265
2266 blps[0] = HASH2BUCKETLOCK(hash);
2267 for (;;) {
2268 blps[1] = NULL;
2269 cache_lock_vnodes_cel(cel, dvp, vp);
2270 if (vp == NULL || vp->v_type != VDIR)
2271 break;
2272 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
2273 if (ncp == NULL)
2274 break;
2275 nc_flag = atomic_load_char(&ncp->nc_flag);
2276 if ((nc_flag & NCF_ISDOTDOT) == 0)
2277 break;
2278 MPASS(ncp->nc_dvp == vp);
2279 blps[1] = NCP2BUCKETLOCK(ncp);
2280 if ((nc_flag & NCF_NEGATIVE) != 0)
2281 break;
2282 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2283 break;
2284 /*
2285 * All vnodes got re-locked. Re-validate the state and if
2286 * nothing changed we are done. Otherwise restart.
2287 */
2288 if (ncp == vp->v_cache_dd &&
2289 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2290 blps[1] == NCP2BUCKETLOCK(ncp) &&
2291 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2292 break;
2293 cache_unlock_vnodes_cel(cel);
2294 cel->vlp[0] = NULL;
2295 cel->vlp[1] = NULL;
2296 cel->vlp[2] = NULL;
2297 }
2298 cache_lock_buckets_cel(cel, blps[0], blps[1]);
2299 }
2300
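/*
 * Variant of cache_enter_lock() used when inserting a ".." entry: the
 * potentially conflicting old entry hangs off dvp->v_cache_dd rather than
 * vp->v_cache_dd.
 */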
2301 static void
2302 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2303 uint32_t hash)
2304 {
2305 struct namecache *ncp;
2306 struct mtx *blps[2];
2307 u_char nc_flag;
2308
2309 blps[0] = HASH2BUCKETLOCK(hash);
2310 for (;;) {
2311 blps[1] = NULL;
2312 cache_lock_vnodes_cel(cel, dvp, vp);
2313 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
2314 if (ncp == NULL)
2315 break;
2316 nc_flag = atomic_load_char(&ncp->nc_flag);
2317 if ((nc_flag & NCF_ISDOTDOT) == 0)
2318 break;
2319 MPASS(ncp->nc_dvp == dvp);
2320 blps[1] = NCP2BUCKETLOCK(ncp);
2321 if ((nc_flag & NCF_NEGATIVE) != 0)
2322 break;
2323 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2324 break;
2325 if (ncp == dvp->v_cache_dd &&
2326 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2327 blps[1] == NCP2BUCKETLOCK(ncp) &&
2328 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2329 break;
2330 cache_unlock_vnodes_cel(cel);
2331 cel->vlp[0] = NULL;
2332 cel->vlp[1] = NULL;
2333 cel->vlp[2] = NULL;
2334 }
2335 cache_lock_buckets_cel(cel, blps[0], blps[1]);
2336 }
2337
2338 static void
2339 cache_enter_unlock(struct celockstate *cel)
2340 {
2341
2342 cache_unlock_buckets_cel(cel);
2343 cache_unlock_vnodes_cel(cel);
2344 }
2345
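/*
 * Remove any ".." entry currently cached for dvp and clear the v_cache_dd
 * pointer, making room for the entry about to be inserted.
 */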
2346 static void __noinline
2347 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2348 struct componentname *cnp)
2349 {
2350 struct celockstate cel;
2351 struct namecache *ncp;
2352 uint32_t hash;
2353 int len;
2354
2355 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
2356 return;
2357 len = cnp->cn_namelen;
2358 cache_celockstate_init(&cel);
2359 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2360 cache_enter_lock_dd(&cel, dvp, vp, hash);
2361 ncp = dvp->v_cache_dd;
2362 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2363 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2364 cache_zap_locked(ncp);
2365 } else {
2366 ncp = NULL;
2367 }
2368 atomic_store_ptr(&dvp->v_cache_dd, NULL);
2369 cache_enter_unlock(&cel);
2370 if (ncp != NULL)
2371 cache_free(ncp);
2372 }
2373
2374 /*
2375 * Add an entry to the cache.
2376 */
2377 void
2378 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2379 struct timespec *tsp, struct timespec *dtsp)
2380 {
2381 struct celockstate cel;
2382 struct namecache *ncp, *n2, *ndd;
2383 struct namecache_ts *ncp_ts;
2384 struct nchashhead *ncpp;
2385 uint32_t hash;
2386 int flag;
2387 int len;
2388
2389 KASSERT(cnp->cn_namelen <= NAME_MAX,
2390 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
2391 NAME_MAX));
2392 #ifdef notyet
2393 /*
2394 * Not everything doing this is weeded out yet.
2395 */
2396 VNPASS(dvp != vp, dvp);
2397 #endif
2398 VNPASS(!VN_IS_DOOMED(dvp), dvp);
2399 VNPASS(dvp->v_type != VNON, dvp);
2400 if (vp != NULL) {
2401 VNPASS(!VN_IS_DOOMED(vp), vp);
2402 VNPASS(vp->v_type != VNON, vp);
2403 }
2404
2405 #ifdef DEBUG_CACHE
2406 if (__predict_false(!doingcache))
2407 return;
2408 #endif
2409
2410 flag = 0;
2411 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2412 if (cnp->cn_namelen == 1)
2413 return;
2414 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2415 cache_enter_dotdot_prep(dvp, vp, cnp);
2416 flag = NCF_ISDOTDOT;
2417 }
2418 }
2419
2420 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2421 if (ncp == NULL)
2422 return;
2423
2424 cache_celockstate_init(&cel);
2425 ndd = NULL;
2426 ncp_ts = NULL;
2427
2428 /*
2429 * Calculate the hash key and setup as much of the new
2430 * namecache entry as possible before acquiring the lock.
2431 */
2432 ncp->nc_flag = flag | NCF_WIP;
2433 ncp->nc_vp = vp;
2434 if (vp == NULL)
2435 cache_neg_init(ncp);
2436 ncp->nc_dvp = dvp;
2437 if (tsp != NULL) {
2438 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2439 ncp_ts->nc_time = *tsp;
2440 ncp_ts->nc_ticks = ticks;
2441 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2442 if (dtsp != NULL) {
2443 ncp_ts->nc_dotdottime = *dtsp;
2444 ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2445 }
2446 }
2447 len = ncp->nc_nlen = cnp->cn_namelen;
2448 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2449 memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2450 ncp->nc_name[len] = '\0';
2451 cache_enter_lock(&cel, dvp, vp, hash);
2452
2453 /*
2454 * See if this vnode or negative entry is already in the cache
2455 * with this name. This can happen with concurrent lookups of
2456 * the same path name.
2457 */
2458 ncpp = NCHHASH(hash);
2459 CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2460 if (n2->nc_dvp == dvp &&
2461 n2->nc_nlen == cnp->cn_namelen &&
2462 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2463 MPASS(cache_ncp_canuse(n2));
2464 if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2465 KASSERT(vp == NULL,
2466 ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
2467 __func__, NULL, vp, cnp->cn_nameptr));
2468 else
2469 KASSERT(n2->nc_vp == vp,
2470 ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
2471 __func__, n2->nc_vp, vp, cnp->cn_nameptr));
2472 /*
2473 * Entries are supposed to be immutable unless in the
2474 * process of getting destroyed. Accommodating
2475 * changing timestamps is possible but not worth it.
2476 * This should be harmless in terms of correctness, in
2477 * the worst case resulting in an earlier expiration.
2478 * Alternatively, the found entry can be replaced
2479 * altogether.
2480 */
2481 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2482 #if 0
2483 if (tsp != NULL) {
2484 KASSERT((n2->nc_flag & NCF_TS) != 0,
2485 ("no NCF_TS"));
2486 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2487 n2_ts->nc_time = ncp_ts->nc_time;
2488 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2489 if (dtsp != NULL) {
2490 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2491 n2_ts->nc_nc.nc_flag |= NCF_DTS;
2492 }
2493 }
2494 #endif
2495 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2496 vp);
2497 goto out_unlock_free;
2498 }
2499 }
2500
2501 if (flag == NCF_ISDOTDOT) {
2502 /*
2503 * See if we are trying to add a ".." entry, but some other lookup
2504 * has already populated the v_cache_dd pointer.
2505 */
2506 if (dvp->v_cache_dd != NULL)
2507 goto out_unlock_free;
2508 KASSERT(vp == NULL || vp->v_type == VDIR,
2509 ("wrong vnode type %p", vp));
2510 atomic_thread_fence_rel();
2511 atomic_store_ptr(&dvp->v_cache_dd, ncp);
2512 }
2513
2514 if (vp != NULL) {
2515 if (flag != NCF_ISDOTDOT) {
2516 /*
2517 * For this case, the cache entry maps both the
2518 * directory name in it and the name ".." for the
2519 * directory's parent.
2520 */
2521 if ((ndd = vp->v_cache_dd) != NULL) {
2522 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2523 cache_zap_locked(ndd);
2524 else
2525 ndd = NULL;
2526 }
2527 atomic_thread_fence_rel();
2528 atomic_store_ptr(&vp->v_cache_dd, ncp);
2529 } else if (vp->v_type != VDIR) {
2530 if (vp->v_cache_dd != NULL) {
2531 atomic_store_ptr(&vp->v_cache_dd, NULL);
2532 }
2533 }
2534 }
2535
2536 if (flag != NCF_ISDOTDOT) {
2537 if (LIST_EMPTY(&dvp->v_cache_src)) {
2538 cache_hold_vnode(dvp);
2539 }
2540 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2541 }
2542
2543 /*
2544 * If the entry is "negative", we place it into the
2545 * "negative" cache queue, otherwise, we place it into the
2546 * destination vnode's cache entries queue.
2547 */
2548 if (vp != NULL) {
2549 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2550 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2551 vp);
2552 } else {
2553 if (cnp->cn_flags & ISWHITEOUT)
2554 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
2555 cache_neg_insert(ncp);
2556 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2557 ncp->nc_name);
2558 }
2559
2560 /*
2561 * Insert the new namecache entry into the appropriate chain
2562 * within the cache entries table.
2563 */
2564 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2565
2566 atomic_thread_fence_rel();
2567 /*
2568 * Mark the entry as fully constructed.
2569 * It is immutable past this point until its removal.
2570 */
2571 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2572
2573 cache_enter_unlock(&cel);
2574 if (ndd != NULL)
2575 cache_free(ndd);
2576 return;
2577 out_unlock_free:
2578 cache_enter_unlock(&cel);
2579 cache_free(ncp);
2580 return;
2581 }
2582
2583 /*
2584 * A variant of the above accepting flags.
2585 *
2586 * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it.
2587 *
2588 * TODO: this routine is a hack. It blindly removes the old entry, even if it
2589 * happens to match, and it does so in an inefficient manner. It was added
2590 * to accommodate NFS, which runs into a case where the target for a given name
2591 * may change from under it. Note this does nothing to solve the following
2592 * race: 2 callers of cache_enter_time_flags pass a different target vnode for
2593 * the same [dvp, cnp]. It may be argued that code doing this is broken.
2594 */
2595 void
2596 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2597 struct timespec *tsp, struct timespec *dtsp, int flags)
2598 {
2599
2600 MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0);
2601
2602 if (flags & VFS_CACHE_DROPOLD)
2603 cache_remove_cnp(dvp, cnp);
2604 cache_enter_time(dvp, vp, cnp, tsp, dtsp);
2605 }
2606
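/*
 * Return the smallest power of 2 strictly greater than the given value.
 */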
2607 static u_int
2608 cache_roundup_2(u_int val)
2609 {
2610 u_int res;
2611
2612 for (res = 1; res <= val; res <<= 1)
2613 continue;
2614
2615 return (res);
2616 }
2617
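/*
 * Allocate and initialize a hash table; the bucket count is a power of 2
 * derived from the requested element count and the corresponding mask is
 * returned via hashmask.
 */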
2618 static struct nchashhead *
2619 nchinittbl(u_long elements, u_long *hashmask)
2620 {
2621 struct nchashhead *hashtbl;
2622 u_long hashsize, i;
2623
2624 hashsize = cache_roundup_2(elements) / 2;
2625
2626 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2627 for (i = 0; i < hashsize; i++)
2628 CK_SLIST_INIT(&hashtbl[i]);
2629 *hashmask = hashsize - 1;
2630 return (hashtbl);
2631 }
2632
2633 static void
2634 ncfreetbl(struct nchashhead *hashtbl)
2635 {
2636
2637 free(hashtbl, M_VFSCACHE);
2638 }
2639
2640 /*
2641 * Name cache initialization, from vfs_init() when we are booting
2642 */
2643 static void
2644 nchinit(void *dummy __unused)
2645 {
2646 u_int i;
2647
2648 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2649 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2650 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2651 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2652 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2653 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2654 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2655 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2656
2657 VFS_SMR_ZONE_SET(cache_zone_small);
2658 VFS_SMR_ZONE_SET(cache_zone_small_ts);
2659 VFS_SMR_ZONE_SET(cache_zone_large);
2660 VFS_SMR_ZONE_SET(cache_zone_large_ts);
2661
2662 ncsize = desiredvnodes * ncsizefactor;
2663 cache_recalc_neg_min(ncnegminpct);
2664 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2665 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2666 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2667 ncbuckethash = 7;
2668 if (ncbuckethash > nchash)
2669 ncbuckethash = nchash;
2670 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2671 M_WAITOK | M_ZERO);
2672 for (i = 0; i < numbucketlocks; i++)
2673 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2674 ncvnodehash = ncbuckethash;
2675 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2676 M_WAITOK | M_ZERO);
2677 for (i = 0; i < numvnodelocks; i++)
2678 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2679
2680 for (i = 0; i < numneglists; i++) {
2681 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2682 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2683 TAILQ_INIT(&neglists[i].nl_list);
2684 TAILQ_INIT(&neglists[i].nl_hotlist);
2685 }
2686 }
2687 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2688
2689 void
2690 cache_vnode_init(struct vnode *vp)
2691 {
2692
2693 LIST_INIT(&vp->v_cache_src);
2694 TAILQ_INIT(&vp->v_cache_dst);
2695 vp->v_cache_dd = NULL;
2696 cache_prehash(vp);
2697 }
2698
2699 /*
2700 * Induce transient cache misses for lockless operation in cache_lookup() by
2701 * using a temporary hash table.
2702 *
2703 * This will force a fs lookup.
2704 *
2705 * Synchronisation is done in 2 steps, calling vfs_smr_synchronize after each
2706 * one so that all CPUs are observed to no longer use the stale state.
2707 */
2708 static void
2709 cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash)
2710 {
2711
2712 MPASS(temphash < nchash);
2713 /*
2714 * Change the size. The new size is smaller and can safely be used
2715 * against the existing table. All lookups which now hash wrong will
2716 * result in a cache miss, which all callers are supposed to know how
2717 * to handle.
2718 */
2719 atomic_store_long(&nchash, temphash);
2720 atomic_thread_fence_rel();
2721 vfs_smr_synchronize();
2722 /*
2723 * At this point everyone sees the updated hash value, but they still
2724 * see the old table.
2725 */
2726 atomic_store_ptr(&nchashtbl, temptbl);
2727 atomic_thread_fence_rel();
2728 vfs_smr_synchronize();
2729 /*
2730 * At this point everyone sees the updated table pointer and size pair.
2731 */
2732 }
2733
2734 /*
2735 * Set the new hash table.
2736 *
2737 * Similarly to cache_changesize_set_temp(), this has to synchronize against
2738 * lockless operation in cache_lookup().
2739 */
2740 static void
2741 cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash)
2742 {
2743
2744 MPASS(nchash < new_hash);
2745 /*
2746 * Change the pointer first. This won't result in out-of-bounds access
2747 * since the temporary table is guaranteed to be smaller.
2748 */
2749 atomic_store_ptr(&nchashtbl, new_tbl);
2750 atomic_thread_fence_rel();
2751 vfs_smr_synchronize();
2752 /*
2753 * At this point everyone sees the updated pointer value, but they
2754 * still see the old size.
2755 */
2756 atomic_store_long(&nchash, new_hash);
2757 atomic_thread_fence_rel();
2758 vfs_smr_synchronize();
2759 /*
2760 * At this point everyone sees the updated table pointer and size pair.
2761 */
2762 }
2763
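/*
 * Resize the hash table to match a new vnode limit, moving all existing
 * entries to the new table. Lockless lookups are kept at bay with a temporary
 * one-bucket table installed in between.
 */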
2764 void
2765 cache_changesize(u_long newmaxvnodes)
2766 {
2767 struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl;
2768 u_long new_nchash, old_nchash, temphash;
2769 struct namecache *ncp;
2770 uint32_t hash;
2771 u_long newncsize;
2772 int i;
2773
2774 newncsize = newmaxvnodes * ncsizefactor;
2775 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2776 if (newmaxvnodes < numbucketlocks)
2777 newmaxvnodes = numbucketlocks;
2778
2779 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2780 /* If same hash table size, nothing to do */
2781 if (nchash == new_nchash) {
2782 ncfreetbl(new_nchashtbl);
2783 return;
2784 }
2785
2786 temptbl = nchinittbl(1, &temphash);
2787
2788 /*
2789 * Move everything from the old hash table to the new table.
2790 * None of the namecache entries in the table can be removed from under us,
2791 * since removing an entry requires going through the hash table, which we hold locked.
2792 */
2793 cache_lock_all_vnodes();
2794 cache_lock_all_buckets();
2795 old_nchashtbl = nchashtbl;
2796 old_nchash = nchash;
2797 cache_changesize_set_temp(temptbl, temphash);
2798 for (i = 0; i <= old_nchash; i++) {
2799 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2800 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2801 ncp->nc_dvp);
2802 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2803 CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash);
2804 }
2805 }
2806 ncsize = newncsize;
2807 cache_recalc_neg_min(ncnegminpct);
2808 cache_changesize_set_new(new_nchashtbl, new_nchash);
2809 cache_unlock_all_buckets();
2810 cache_unlock_all_vnodes();
2811 ncfreetbl(old_nchashtbl);
2812 ncfreetbl(temptbl);
2813 }
2814
2815 /*
2816 * Remove all entries from and to a particular vnode.
2817 */
2818 static void
2819 cache_purge_impl(struct vnode *vp)
2820 {
2821 struct cache_freebatch batch;
2822 struct namecache *ncp;
2823 struct mtx *vlp, *vlp2;
2824
2825 TAILQ_INIT(&batch);
2826 vlp = VP2VNODELOCK(vp);
2827 vlp2 = NULL;
2828 mtx_lock(vlp);
2829 retry:
2830 while (!LIST_EMPTY(&vp->v_cache_src)) {
2831 ncp = LIST_FIRST(&vp->v_cache_src);
2832 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2833 goto retry;
2834 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2835 }
2836 while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2837 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2838 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2839 goto retry;
2840 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2841 }
2842 ncp = vp->v_cache_dd;
2843 if (ncp != NULL) {
2844 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2845 ("lost dotdot link"));
2846 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2847 goto retry;
2848 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2849 }
2850 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2851 mtx_unlock(vlp);
2852 if (vlp2 != NULL)
2853 mtx_unlock(vlp2);
2854 cache_free_batch(&batch);
2855 }
2856
2857 /*
2858 * Opportunistic check to see if there is anything to do.
2859 */
2860 static bool
2861 cache_has_entries(struct vnode *vp)
2862 {
2863
2864 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2865 atomic_load_ptr(&vp->v_cache_dd) == NULL)
2866 return (false);
2867 return (true);
2868 }
2869
2870 void
2871 cache_purge(struct vnode *vp)
2872 {
2873
2874 SDT_PROBE1(vfs, namecache, purge, done, vp);
2875 if (!cache_has_entries(vp))
2876 return;
2877 cache_purge_impl(vp);
2878 }
2879
2880 /*
2881 * Only to be used by vgone.
2882 */
2883 void
2884 cache_purge_vgone(struct vnode *vp)
2885 {
2886 struct mtx *vlp;
2887
2888 VNPASS(VN_IS_DOOMED(vp), vp);
2889 if (cache_has_entries(vp)) {
2890 cache_purge_impl(vp);
2891 return;
2892 }
2893
2894 /*
2895 * Serialize against a potential thread doing cache_purge.
2896 */
2897 vlp = VP2VNODELOCK(vp);
2898 mtx_wait_unlocked(vlp);
2899 if (cache_has_entries(vp)) {
2900 cache_purge_impl(vp);
2901 return;
2902 }
2903 return;
2904 }
2905
2906 /*
2907 * Remove all negative entries for a particular directory vnode.
2908 */
2909 void
2910 cache_purge_negative(struct vnode *vp)
2911 {
2912 struct cache_freebatch batch;
2913 struct namecache *ncp, *nnp;
2914 struct mtx *vlp;
2915
2916 SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2917 if (LIST_EMPTY(&vp->v_cache_src))
2918 return;
2919 TAILQ_INIT(&batch);
2920 vlp = VP2VNODELOCK(vp);
2921 mtx_lock(vlp);
2922 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2923 if (!(ncp->nc_flag & NCF_NEGATIVE))
2924 continue;
2925 cache_zap_negative_locked_vnode_kl(ncp, vp);
2926 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2927 }
2928 mtx_unlock(vlp);
2929 cache_free_batch(&batch);
2930 }
2931
2932 /*
2933 * Entry points for VOP operations that modify the namespace (rename, rmdir).
2934 */
2935 void
2936 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2937 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2938 {
2939
2940 ASSERT_VOP_IN_SEQC(fdvp);
2941 ASSERT_VOP_IN_SEQC(fvp);
2942 ASSERT_VOP_IN_SEQC(tdvp);
2943 if (tvp != NULL)
2944 ASSERT_VOP_IN_SEQC(tvp);
2945
2946 cache_purge(fvp);
2947 if (tvp != NULL) {
2948 cache_purge(tvp);
2949 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2950 ("%s: lingering negative entry", __func__));
2951 } else {
2952 cache_remove_cnp(tdvp, tcnp);
2953 }
2954
2955 /*
2956 * TODO
2957 *
2958 * Historically renaming always purged all relevant entries,
2959 * but that's quite wasteful. In particular it turns out that in many cases
2960 * the target file is immediately accessed after a rename, inducing a cache
2961 * miss.
2962 *
2963 * Recode this to reduce relocking and reuse the existing entry (if any)
2964 * instead of just removing it above and allocating a new one here.
2965 */
2966 if (cache_rename_add) {
2967 cache_enter(tdvp, fvp, tcnp);
2968 }
2969 }
2970
2971 void
2972 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
2973 {
2974
2975 ASSERT_VOP_IN_SEQC(dvp);
2976 ASSERT_VOP_IN_SEQC(vp);
2977 cache_purge(vp);
2978 }
2979
2980 #ifdef INVARIANTS
2981 /*
2982 * Validate that if an entry exists it matches.
2983 */
2984 void
2985 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2986 {
2987 struct namecache *ncp;
2988 struct mtx *blp;
2989 uint32_t hash;
2990
2991 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2992 if (CK_SLIST_EMPTY(NCHHASH(hash)))
2993 return;
2994 blp = HASH2BUCKETLOCK(hash);
2995 mtx_lock(blp);
2996 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2997 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2998 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
2999 if (ncp->nc_vp != vp)
3000 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n",
3001 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp);
3002 }
3003 }
3004 mtx_unlock(blp);
3005 }
3006 #endif
3007
3008 /*
3009 * Flush all entries referencing a particular filesystem.
3010 */
3011 void
3012 cache_purgevfs(struct mount *mp)
3013 {
3014 struct vnode *vp, *mvp;
3015 size_t visited, purged;
3016
3017 visited = purged = 0;
3018 /*
3019 * Somewhat wasteful iteration over all vnodes. Would be better to
3020 * support filtering and avoid the interlock to begin with.
3021 */
3022 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
3023 visited++;
3024 if (!cache_has_entries(vp)) {
3025 VI_UNLOCK(vp);
3026 continue;
3027 }
3028 vholdl(vp);
3029 VI_UNLOCK(vp);
3030 cache_purge(vp);
3031 purged++;
3032 vdrop(vp);
3033 }
3034
3035 SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged);
3036 }
3037
3038 /*
3039 * Perform canonical checks and cache lookup, passing the request on to the
3040 * filesystem through VOP_CACHEDLOOKUP() only if needed.
3041 */
3042
3043 int
3044 vfs_cache_lookup(struct vop_lookup_args *ap)
3045 {
3046 struct vnode *dvp;
3047 int error;
3048 struct vnode **vpp = ap->a_vpp;
3049 struct componentname *cnp = ap->a_cnp;
3050 int flags = cnp->cn_flags;
3051
3052 *vpp = NULL;
3053 dvp = ap->a_dvp;
3054
3055 if (dvp->v_type != VDIR)
3056 return (ENOTDIR);
3057
3058 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
3059 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
3060 return (EROFS);
3061
3062 error = vn_dir_check_exec(dvp, cnp);
3063 if (error != 0)
3064 return (error);
3065
3066 error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
3067 if (error == 0)
3068 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
3069 if (error == -1)
3070 return (0);
3071 return (error);
3072 }
3073
3074 /* Implementation of the getcwd syscall. */
3075 int
3076 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
3077 {
3078 char *buf, *retbuf;
3079 size_t buflen;
3080 int error;
3081
3082 buflen = uap->buflen;
3083 if (__predict_false(buflen < 2))
3084 return (EINVAL);
3085 if (buflen > MAXPATHLEN)
3086 buflen = MAXPATHLEN;
3087
3088 buf = uma_zalloc(namei_zone, M_WAITOK);
3089 error = vn_getcwd(buf, &retbuf, &buflen);
3090 if (error == 0)
3091 error = copyout(retbuf, uap->buf, buflen);
3092 uma_zfree(namei_zone, buf);
3093 return (error);
3094 }
3095
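/*
 * Resolve the current working directory into the provided buffer, trying the
 * lockless reverse lookup first and falling back to the locked variant.
 */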
3096 int
3097 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
3098 {
3099 struct pwd *pwd;
3100 int error;
3101
3102 vfs_smr_enter();
3103 pwd = pwd_get_smr();
3104 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
3105 buflen, 0);
3106 VFS_SMR_ASSERT_NOT_ENTERED();
3107 if (error < 0) {
3108 pwd = pwd_hold(curthread);
3109 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
3110 retbuf, buflen);
3111 pwd_drop(pwd);
3112 }
3113
3114 #ifdef KTRACE
3115 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
3116 ktrnamei(*retbuf);
3117 #endif
3118 return (error);
3119 }
3120
3121 static int
3122 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
3123 size_t size, int flags, enum uio_seg pathseg)
3124 {
3125 struct nameidata nd;
3126 char *retbuf, *freebuf;
3127 int error;
3128
3129 if (flags != 0)
3130 return (EINVAL);
3131 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
3132 pathseg, path, fd, &cap_fstat_rights, td);
3133 if ((error = namei(&nd)) != 0)
3134 return (error);
3135 error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr,
3136 nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &size);
3137 if (error == 0) {
3138 error = copyout(retbuf, buf, size);
3139 free(freebuf, M_TEMP);
3140 }
3141 NDFREE(&nd, 0);
3142 return (error);
3143 }
3144
3145 int
3146 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
3147 {
3148
3149 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
3150 uap->flags, UIO_USERSPACE));
3151 }
3152
3153 /*
3154 * Retrieve the full filesystem path that corresponds to a vnode from the name
3155 * cache (if available).
3156 */
3157 int
3158 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
3159 {
3160 struct pwd *pwd;
3161 char *buf;
3162 size_t buflen;
3163 int error;
3164
3165 if (__predict_false(vp == NULL))
3166 return (EINVAL);
3167
3168 buflen = MAXPATHLEN;
3169 buf = malloc(buflen, M_TEMP, M_WAITOK);
3170 vfs_smr_enter();
3171 pwd = pwd_get_smr();
3172 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
3173 VFS_SMR_ASSERT_NOT_ENTERED();
3174 if (error < 0) {
3175 pwd = pwd_hold(curthread);
3176 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
3177 pwd_drop(pwd);
3178 }
3179 if (error == 0)
3180 *freebuf = buf;
3181 else
3182 free(buf, M_TEMP);
3183 return (error);
3184 }
3185
3186 /*
3187 * This function is similar to vn_fullpath, but it attempts to lookup the
3188 * pathname relative to the global root mount point. This is required for the
3189 * auditing sub-system, as audited pathnames must be absolute, relative to the
3190 * global root mount point.
3191 */
3192 int
3193 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
3194 {
3195 char *buf;
3196 size_t buflen;
3197 int error;
3198
3199 if (__predict_false(vp == NULL))
3200 return (EINVAL);
3201 buflen = MAXPATHLEN;
3202 buf = malloc(buflen, M_TEMP, M_WAITOK);
3203 vfs_smr_enter();
3204 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
3205 VFS_SMR_ASSERT_NOT_ENTERED();
3206 if (error < 0) {
3207 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
3208 }
3209 if (error == 0)
3210 *freebuf = buf;
3211 else
3212 free(buf, M_TEMP);
3213 return (error);
3214 }
3215
3216 static struct namecache *
3217 vn_dd_from_dst(struct vnode *vp)
3218 {
3219 struct namecache *ncp;
3220
3221 cache_assert_vnode_locked(vp);
3222 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
3223 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3224 return (ncp);
3225 }
3226 return (NULL);
3227 }
3228
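/*
 * Resolve one pathname component: find the name of *vp in its parent
 * directory, copy it to the end of the buffer and replace *vp with a
 * referenced parent vnode. The namecache is consulted first, with
 * VOP_VPTOCNP() as the fallback.
 */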
3229 int
3230 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
3231 {
3232 struct vnode *dvp;
3233 struct namecache *ncp;
3234 struct mtx *vlp;
3235 int error;
3236
3237 vlp = VP2VNODELOCK(*vp);
3238 mtx_lock(vlp);
3239 ncp = (*vp)->v_cache_dd;
3240 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
3241 KASSERT(ncp == vn_dd_from_dst(*vp),
3242 ("%s: mismatch for dd entry (%p != %p)", __func__,
3243 ncp, vn_dd_from_dst(*vp)));
3244 } else {
3245 ncp = vn_dd_from_dst(*vp);
3246 }
3247 if (ncp != NULL) {
3248 if (*buflen < ncp->nc_nlen) {
3249 mtx_unlock(vlp);
3250 vrele(*vp);
3251 counter_u64_add(numfullpathfail4, 1);
3252 error = ENOMEM;
3253 SDT_PROBE3(vfs, namecache, fullpath, return, error,
3254 vp, NULL);
3255 return (error);
3256 }
3257 *buflen -= ncp->nc_nlen;
3258 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3259 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
3260 ncp->nc_name, vp);
3261 dvp = *vp;
3262 *vp = ncp->nc_dvp;
3263 vref(*vp);
3264 mtx_unlock(vlp);
3265 vrele(dvp);
3266 return (0);
3267 }
3268 SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
3269
3270 mtx_unlock(vlp);
3271 vn_lock(*vp, LK_SHARED | LK_RETRY);
3272 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
3273 vput(*vp);
3274 if (error) {
3275 counter_u64_add(numfullpathfail2, 1);
3276 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3277 return (error);
3278 }
3279
3280 *vp = dvp;
3281 if (VN_IS_DOOMED(dvp)) {
3282 /* forced unmount */
3283 vrele(dvp);
3284 error = ENOENT;
3285 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3286 return (error);
3287 }
3288 /*
3289 * *vp has its use count incremented still.
3290 */
3291
3292 return (0);
3293 }
3294
3295 /*
3296 * Resolve a directory to a pathname.
3297 *
3298 * The name of the directory can always be found in the namecache or fetched
3299 * from the filesystem. There is also guaranteed to be only one parent, meaning
3300 * we can just follow vnodes up until we find the root.
3301 *
3302 * The vnode must be referenced.
3303 */
3304 static int
3305 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3306 size_t *len, size_t addend)
3307 {
3308 #ifdef KDTRACE_HOOKS
3309 struct vnode *startvp = vp;
3310 #endif
3311 struct vnode *vp1;
3312 size_t buflen;
3313 int error;
3314 bool slash_prefixed;
3315
3316 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3317 VNPASS(vp->v_usecount > 0, vp);
3318
3319 buflen = *len;
3320
3321 slash_prefixed = true;
3322 if (addend == 0) {
3323 MPASS(*len >= 2);
3324 buflen--;
3325 buf[buflen] = '\0';
3326 slash_prefixed = false;
3327 }
3328
3329 error = 0;
3330
3331 SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
3332 counter_u64_add(numfullpathcalls, 1);
3333 while (vp != rdir && vp != rootvnode) {
3334 /*
3335 * The vp vnode must already be fully constructed,
3336 * since it is either found in the namecache or obtained
3337 * from VOP_VPTOCNP(). We may test for VV_ROOT safely
3338 * without obtaining the vnode lock.
3339 */
3340 if ((vp->v_vflag & VV_ROOT) != 0) {
3341 vn_lock(vp, LK_RETRY | LK_SHARED);
3342
3343 /*
3344 * With the vnode locked, check for races with
3345 * unmount, forced or not. Note that we
3346 * already verified that vp is not equal to
3347 * the root vnode, which means that
3348 * mnt_vnodecovered can be NULL only for the
3349 * case of unmount.
3350 */
3351 if (VN_IS_DOOMED(vp) ||
3352 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
3353 vp1->v_mountedhere != vp->v_mount) {
3354 vput(vp);
3355 error = ENOENT;
3356 SDT_PROBE3(vfs, namecache, fullpath, return,
3357 error, vp, NULL);
3358 break;
3359 }
3360
3361 vref(vp1);
3362 vput(vp);
3363 vp = vp1;
3364 continue;
3365 }
3366 if (vp->v_type != VDIR) {
3367 vrele(vp);
3368 counter_u64_add(numfullpathfail1, 1);
3369 error = ENOTDIR;
3370 SDT_PROBE3(vfs, namecache, fullpath, return,
3371 error, vp, NULL);
3372 break;
3373 }
3374 error = vn_vptocnp(&vp, buf, &buflen);
3375 if (error)
3376 break;
3377 if (buflen == 0) {
3378 vrele(vp);
3379 error = ENOMEM;
3380 SDT_PROBE3(vfs, namecache, fullpath, return, error,
3381 startvp, NULL);
3382 break;
3383 }
3384 buf[--buflen] = '/';
3385 slash_prefixed = true;
3386 }
3387 if (error)
3388 return (error);
3389 if (!slash_prefixed) {
3390 if (buflen == 0) {
3391 vrele(vp);
3392 counter_u64_add(numfullpathfail4, 1);
3393 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3394 startvp, NULL);
3395 return (ENOMEM);
3396 }
3397 buf[--buflen] = '/';
3398 }
3399 counter_u64_add(numfullpathfound, 1);
3400 vrele(vp);
3401
3402 *retbuf = buf + buflen;
3403 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3404 *len -= buflen;
3405 *len += addend;
3406 return (0);
3407 }
3408
3409 /*
3410 * Resolve an arbitrary vnode to a pathname.
3411 *
3412 * Note 2 caveats:
3413 * - hardlinks are not tracked, thus if the vnode is not a directory this can
3414 * resolve to a different path than the one used to find it
3415 * - namecache is not mandatory, meaning names are not guaranteed to be added
3416 * (in which case resolving fails)
3417 */
3418 static void __inline
3419 cache_rev_failed_impl(int *reason, int line)
3420 {
3421
3422 *reason = line;
3423 }
3424 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__)
3425
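/*
 * Lockless (SMR) variant of the reverse lookup. A negative return value
 * requests the caller to retry via the locked path.
 */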
3426 static int
3427 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3428 char **retbuf, size_t *buflen, size_t addend)
3429 {
3430 #ifdef KDTRACE_HOOKS
3431 struct vnode *startvp = vp;
3432 #endif
3433 struct vnode *tvp;
3434 struct mount *mp;
3435 struct namecache *ncp;
3436 size_t orig_buflen;
3437 int reason;
3438 int error;
3439 #ifdef KDTRACE_HOOKS
3440 int i;
3441 #endif
3442 seqc_t vp_seqc, tvp_seqc;
3443 u_char nc_flag;
3444
3445 VFS_SMR_ASSERT_ENTERED();
3446
3447 if (!cache_fast_revlookup) {
3448 vfs_smr_exit();
3449 return (-1);
3450 }
3451
3452 orig_buflen = *buflen;
3453
3454 if (addend == 0) {
3455 MPASS(*buflen >= 2);
3456 *buflen -= 1;
3457 buf[*buflen] = '\0';
3458 }
3459
3460 if (vp == rdir || vp == rootvnode) {
3461 if (addend == 0) {
3462 *buflen -= 1;
3463 buf[*buflen] = '/';
3464 }
3465 goto out_ok;
3466 }
3467
3468 #ifdef KDTRACE_HOOKS
3469 i = 0;
3470 #endif
3471 error = -1;
3472 ncp = NULL; /* for sdt probe down below */
3473 vp_seqc = vn_seqc_read_any(vp);
3474 if (seqc_in_modify(vp_seqc)) {
3475 cache_rev_failed(&reason);
3476 goto out_abort;
3477 }
3478
3479 for (;;) {
3480 #ifdef KDTRACE_HOOKS
3481 i++;
3482 #endif
3483 if ((vp->v_vflag & VV_ROOT) != 0) {
3484 mp = atomic_load_ptr(&vp->v_mount);
3485 if (mp == NULL) {
3486 cache_rev_failed(&reason);
3487 goto out_abort;
3488 }
3489 tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3490 tvp_seqc = vn_seqc_read_any(tvp);
3491 if (seqc_in_modify(tvp_seqc)) {
3492 cache_rev_failed(&reason);
3493 goto out_abort;
3494 }
3495 if (!vn_seqc_consistent(vp, vp_seqc)) {
3496 cache_rev_failed(&reason);
3497 goto out_abort;
3498 }
3499 vp = tvp;
3500 vp_seqc = tvp_seqc;
3501 continue;
3502 }
3503 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
3504 if (ncp == NULL) {
3505 cache_rev_failed(&reason);
3506 goto out_abort;
3507 }
3508 nc_flag = atomic_load_char(&ncp->nc_flag);
3509 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3510 cache_rev_failed(&reason);
3511 goto out_abort;
3512 }
3513 if (ncp->nc_nlen >= *buflen) {
3514 cache_rev_failed(&reason);
3515 error = ENOMEM;
3516 goto out_abort;
3517 }
3518 *buflen -= ncp->nc_nlen;
3519 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3520 *buflen -= 1;
3521 buf[*buflen] = '/';
3522 tvp = ncp->nc_dvp;
3523 tvp_seqc = vn_seqc_read_any(tvp);
3524 if (seqc_in_modify(tvp_seqc)) {
3525 cache_rev_failed(&reason);
3526 goto out_abort;
3527 }
3528 if (!vn_seqc_consistent(vp, vp_seqc)) {
3529 cache_rev_failed(&reason);
3530 goto out_abort;
3531 }
3532 /*
3533 * Acquire fence provided by vn_seqc_read_any above.
3534 */
3535 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
3536 cache_rev_failed(&reason);
3537 goto out_abort;
3538 }
3539 if (!cache_ncp_canuse(ncp)) {
3540 cache_rev_failed(&reason);
3541 goto out_abort;
3542 }
3543 vp = tvp;
3544 vp_seqc = tvp_seqc;
3545 if (vp == rdir || vp == rootvnode)
3546 break;
3547 }
3548 out_ok:
3549 vfs_smr_exit();
3550 *retbuf = buf + *buflen;
3551 *buflen = orig_buflen - *buflen + addend;
3552 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3553 return (0);
3554
3555 out_abort:
3556 *buflen = orig_buflen;
3557 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3558 vfs_smr_exit();
3559 return (error);
3560 }
3561
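/*
 * Resolve an arbitrary vnode to a pathname using the locked path: for
 * non-directories the last component is resolved with vn_vptocnp() first,
 * after which the work is handed to vn_fullpath_dir().
 */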
3562 static int
3563 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3564 size_t *buflen)
3565 {
3566 size_t orig_buflen, addend;
3567 int error;
3568
3569 if (*buflen < 2)
3570 return (EINVAL);
3571
3572 orig_buflen = *buflen;
3573
3574 vref(vp);
3575 addend = 0;
3576 if (vp->v_type != VDIR) {
3577 *buflen -= 1;
3578 buf[*buflen] = '\0';
3579 error = vn_vptocnp(&vp, buf, buflen);
3580 if (error)
3581 return (error);
3582 if (*buflen == 0) {
3583 vrele(vp);
3584 return (ENOMEM);
3585 }
3586 *buflen -= 1;
3587 buf[*buflen] = '/';
3588 addend = orig_buflen - *buflen;
3589 }
3590
3591 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3592 }
3593
3594 /*
3595 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3596 *
3597 * Since the namecache does not track hardlinks, the caller is
3598 * expected to first look up the target vnode with SAVENAME |
3599 * WANTPARENT flags passed to namei to get dvp and vp.
3600 *
3601 * Then we have 2 cases:
3602 * - if the found vnode is a directory, the path can be constructed just by
3603 * following names up the chain
3604 * - otherwise we populate the buffer with the saved name and start resolving
3605 * from the parent
3606 */
3607 int
3608 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp,
3609 const char *hrdl_name, size_t hrdl_name_length,
3610 char **retbuf, char **freebuf, size_t *buflen)
3611 {
3612 char *buf, *tmpbuf;
3613 struct pwd *pwd;
3614 size_t addend;
3615 int error;
3616 enum vtype type;
3617
3618 if (*buflen < 2)
3619 return (EINVAL);
3620 if (*buflen > MAXPATHLEN)
3621 *buflen = MAXPATHLEN;
3622
3623 buf = malloc(*buflen, M_TEMP, M_WAITOK);
3624
3625 addend = 0;
3626
3627 /*
3628 * Check for VBAD to work around the vp_crossmp bug in lookup().
3629 *
3630 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3631 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3632 * If the type is VDIR (like in this very case) we can skip looking
3633 * at ni_dvp in the first place. However, since vnodes get passed here
3634 * unlocked the target may transition to doomed state (type == VBAD)
3635 * before we get to evaluate the condition. If this happens, we will
3636 * populate part of the buffer and descend to vn_fullpath_dir with
3637 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3638 *
3639 * This should be atomic_load(&vp->v_type) but it is illegal to take
3640 * an address of a bit field, even if said field is sized to char.
3641 * Work around the problem by reading the value into a full-sized enum
3642 * and then re-reading it with atomic_load which will still prevent
3643 * the compiler from re-reading down the road.
3644 */
3645 type = vp->v_type;
3646 type = atomic_load_int(&type);
3647 if (type == VBAD) {
3648 error = ENOENT;
3649 goto out_bad;
3650 }
3651 if (type != VDIR) {
3652 addend = hrdl_name_length + 2;
3653 if (*buflen < addend) {
3654 error = ENOMEM;
3655 goto out_bad;
3656 }
3657 *buflen -= addend;
3658 tmpbuf = buf + *buflen;
3659 tmpbuf[0] = '/';
3660 memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length);
3661 tmpbuf[addend - 1] = '\0';
3662 vp = dvp;
3663 }
3664
3665 vfs_smr_enter();
3666 pwd = pwd_get_smr();
3667 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3668 addend);
3669 VFS_SMR_ASSERT_NOT_ENTERED();
3670 if (error < 0) {
3671 pwd = pwd_hold(curthread);
3672 vref(vp);
3673 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3674 addend);
3675 pwd_drop(pwd);
3676 }
3677 if (error != 0)
3678 goto out_bad;
3679
3680 *freebuf = buf;
3681
3682 return (0);
3683 out_bad:
3684 free(buf, M_TEMP);
3685 return (error);
3686 }
3687
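/*
 * Return a referenced vnode for the parent directory of the given vnode as
 * recorded in the namecache, or NULL if no usable entry is found.
 */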
3688 struct vnode *
3689 vn_dir_dd_ino(struct vnode *vp)
3690 {
3691 struct namecache *ncp;
3692 struct vnode *ddvp;
3693 struct mtx *vlp;
3694 enum vgetstate vs;
3695
3696 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3697 vlp = VP2VNODELOCK(vp);
3698 mtx_lock(vlp);
3699 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3700 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3701 continue;
3702 ddvp = ncp->nc_dvp;
3703 vs = vget_prep(ddvp);
3704 mtx_unlock(vlp);
3705 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3706 return (NULL);
3707 return (ddvp);
3708 }
3709 mtx_unlock(vlp);
3710 return (NULL);
3711 }
3712
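/*
 * Copy a cached name of the vnode into the provided buffer, NUL-terminating
 * it. Returns ENOENT if the namecache has no entry for the vnode.
 */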
3713 int
3714 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3715 {
3716 struct namecache *ncp;
3717 struct mtx *vlp;
3718 int l;
3719
3720 vlp = VP2VNODELOCK(vp);
3721 mtx_lock(vlp);
3722 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3723 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3724 break;
3725 if (ncp == NULL) {
3726 mtx_unlock(vlp);
3727 return (ENOENT);
3728 }
3729 l = min(ncp->nc_nlen, buflen - 1);
3730 memcpy(buf, ncp->nc_name, l);
3731 mtx_unlock(vlp);
3732 buf[l] = '\0';
3733 return (0);
3734 }
3735
3736 /*
3737 * This function updates the path string to the vnode's full global path
3738 * and checks the size of the new path string against the pathlen argument.
3739 *
3740 * Requires a locked, referenced vnode.
3741 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3742 *
3743 * If vp is a directory, the call to vn_fullpath_global() always succeeds
3744 * because it falls back to the ".." lookup if the namecache lookup fails.
3745 */
3746 int
3747 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3748 u_int pathlen)
3749 {
3750 struct nameidata nd;
3751 struct vnode *vp1;
3752 char *rpath, *fbuf;
3753 int error;
3754
3755 ASSERT_VOP_ELOCKED(vp, __func__);
3756
3757 /* Construct global filesystem path from vp. */
3758 VOP_UNLOCK(vp);
3759 error = vn_fullpath_global(vp, &rpath, &fbuf);
3760
3761 if (error != 0) {
3762 vrele(vp);
3763 return (error);
3764 }
3765
3766 if (strlen(rpath) >= pathlen) {
3767 vrele(vp);
3768 error = ENAMETOOLONG;
3769 goto out;
3770 }
3771
3772 /*
3773 * Re-lookup the vnode by path to detect a possible rename.
3774 * As a side effect, the vnode is relocked.
3775 * If vnode was renamed, return ENOENT.
3776 */
3777 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3778 UIO_SYSSPACE, path, td);
3779 error = namei(&nd);
3780 if (error != 0) {
3781 vrele(vp);
3782 goto out;
3783 }
3784 NDFREE(&nd, NDF_ONLY_PNBUF);
3785 vp1 = nd.ni_vp;
3786 vrele(vp);
3787 if (vp1 == vp)
3788 strcpy(path, rpath);
3789 else {
3790 vput(vp1);
3791 error = ENOENT;
3792 }
3793
3794 out:
3795 free(fbuf, M_TEMP);
3796 return (error);
3797 }
3798
3799 #ifdef DDB
3800 static void
3801 db_print_vpath(struct vnode *vp)
3802 {
3803
3804 while (vp != NULL) {
3805 db_printf("%p: ", vp);
3806 if (vp == rootvnode) {
3807 db_printf("/");
3808 vp = NULL;
3809 } else {
3810 if (vp->v_vflag & VV_ROOT) {
3811 db_printf("<mount point>");
3812 vp = vp->v_mount->mnt_vnodecovered;
3813 } else {
3814 struct namecache *ncp;
3815 char *ncn;
3816 int i;
3817
3818 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3819 if (ncp != NULL) {
3820 ncn = ncp->nc_name;
3821 for (i = 0; i < ncp->nc_nlen; i++)
3822 db_printf("%c", *ncn++);
3823 vp = ncp->nc_dvp;
3824 } else {
3825 vp = NULL;
3826 }
3827 }
3828 }
3829 db_printf("\n");
3830 }
3831
3832 return;
3833 }
3834
3835 DB_SHOW_COMMAND(vpath, db_show_vpath)
3836 {
3837 struct vnode *vp;
3838
3839 if (!have_addr) {
3840 db_printf("usage: show vpath <struct vnode *>\n");
3841 return;
3842 }
3843
3844 vp = (struct vnode *)addr;
3845 db_print_vpath(vp);
3846 }
3847
3848 #endif
3849
3850 static int cache_fast_lookup = 1;
3851 static char __read_frequently cache_fast_lookup_enabled = true;
3852
3853 #define CACHE_FPL_FAILED -2020
3854
3855 void
3856 cache_fast_lookup_enabled_recalc(void)
3857 {
3858 int lookup_flag;
3859 int mac_on;
3860
3861 #ifdef MAC
3862 mac_on = mac_vnode_check_lookup_enabled();
3863 mac_on |= mac_vnode_check_readlink_enabled();
3864 #else
3865 mac_on = 0;
3866 #endif
3867
3868 lookup_flag = atomic_load_int(&cache_fast_lookup);
3869 if (lookup_flag && !mac_on) {
3870 atomic_store_char(&cache_fast_lookup_enabled, true);
3871 } else {
3872 atomic_store_char(&cache_fast_lookup_enabled, false);
3873 }
3874 }
3875
3876 static int
3877 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
3878 {
3879 int error, old;
3880
3881 old = atomic_load_int(&cache_fast_lookup);
3882 error = sysctl_handle_int(oidp, arg1, arg2, req);
3883 if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
3884 cache_fast_lookup_enabled_recalc();
3885 return (error);
3886 }
3887 SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
3888 &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", "");
3889
3890 /*
3891 * Components of nameidata (or objects it can point to) which may
3892 * need restoring in case fast path lookup fails.
3893 */
3894 struct nameidata_outer {
3895 size_t ni_pathlen;
3896 int cn_flags;
3897 };
3898
3899 struct nameidata_saved {
3900 #ifdef INVARIANTS
3901 char *cn_nameptr;
3902 size_t ni_pathlen;
3903 #endif
3904 };
3905
3906 #ifdef INVARIANTS
3907 struct cache_fpl_debug {
3908 size_t ni_pathlen;
3909 };
3910 #endif
3911
3912 struct cache_fpl {
3913 struct nameidata *ndp;
3914 struct componentname *cnp;
3915 char *nulchar;
3916 struct vnode *dvp;
3917 struct vnode *tvp;
3918 seqc_t dvp_seqc;
3919 seqc_t tvp_seqc;
3920 uint32_t hash;
3921 struct nameidata_saved snd;
3922 struct nameidata_outer snd_outer;
3923 int line;
3924 enum cache_fpl_status status:8;
3925 bool in_smr;
3926 bool fsearch;
3927 bool savename;
3928 struct pwd **pwd;
3929 #ifdef INVARIANTS
3930 struct cache_fpl_debug debug;
3931 #endif
3932 };
3933
3934 static bool cache_fplookup_mp_supported(struct mount *mp);
3935 static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
3936 static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
3937 static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
3938 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
3939 static int cache_fplookup_trailingslash(struct cache_fpl *fpl);
3940 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
3941 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
3942 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
3943 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
3944
3945 static void
3946 cache_fpl_cleanup_cnp(struct componentname *cnp)
3947 {
3948
3949 uma_zfree(namei_zone, cnp->cn_pnbuf);
3950 #ifdef DIAGNOSTIC
3951 cnp->cn_pnbuf = NULL;
3952 cnp->cn_nameptr = NULL;
3953 #endif
3954 }
3955
3956 static struct vnode *
3957 cache_fpl_handle_root(struct cache_fpl *fpl)
3958 {
3959 struct nameidata *ndp;
3960 struct componentname *cnp;
3961
3962 ndp = fpl->ndp;
3963 cnp = fpl->cnp;
3964
3965 MPASS(*(cnp->cn_nameptr) == '/');
3966 cnp->cn_nameptr++;
3967 cache_fpl_pathlen_dec(fpl);
3968
3969 if (__predict_false(*(cnp->cn_nameptr) == '/')) {
3970 do {
3971 cnp->cn_nameptr++;
3972 cache_fpl_pathlen_dec(fpl);
3973 } while (*(cnp->cn_nameptr) == '/');
3974 }
3975
3976 return (ndp->ni_rootdir);
3977 }
3978
3979 static void
3980 cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
3981 {
3982
3983 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
3984 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
3985 }
3986
3987 static void
3988 cache_fpl_checkpoint(struct cache_fpl *fpl)
3989 {
3990
3991 #ifdef INVARIANTS
3992 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3993 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen;
3994 #endif
3995 }
3996
3997 static void
3998 cache_fpl_restore_partial(struct cache_fpl *fpl)
3999 {
4000
4001 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
4002 #ifdef INVARIANTS
4003 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
4004 #endif
4005 }
4006
4007 static void
4008 cache_fpl_restore_abort(struct cache_fpl *fpl)
4009 {
4010
4011 cache_fpl_restore_partial(fpl);
4012 /*
4013 * It is 0 on entry by API contract.
4014 */
4015 fpl->ndp->ni_resflags = 0;
4016 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
4017 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
4018 }
4019
4020 #ifdef INVARIANTS
4021 #define cache_fpl_smr_assert_entered(fpl) ({ \
4022 struct cache_fpl *_fpl = (fpl); \
4023 MPASS(_fpl->in_smr == true); \
4024 VFS_SMR_ASSERT_ENTERED(); \
4025 })
4026 #define cache_fpl_smr_assert_not_entered(fpl) ({ \
4027 struct cache_fpl *_fpl = (fpl); \
4028 MPASS(_fpl->in_smr == false); \
4029 VFS_SMR_ASSERT_NOT_ENTERED(); \
4030 })
4031 static void
4032 cache_fpl_assert_status(struct cache_fpl *fpl)
4033 {
4034
4035 switch (fpl->status) {
4036 case CACHE_FPL_STATUS_UNSET:
4037 __assert_unreachable();
4038 break;
4039 case CACHE_FPL_STATUS_DESTROYED:
4040 case CACHE_FPL_STATUS_ABORTED:
4041 case CACHE_FPL_STATUS_PARTIAL:
4042 case CACHE_FPL_STATUS_HANDLED:
4043 break;
4044 }
4045 }
4046 #else
4047 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
4048 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
4049 #define cache_fpl_assert_status(fpl) do { } while (0)
4050 #endif
4051
4052 #define cache_fpl_smr_enter_initial(fpl) ({ \
4053 struct cache_fpl *_fpl = (fpl); \
4054 vfs_smr_enter(); \
4055 _fpl->in_smr = true; \
4056 })
4057
4058 #define cache_fpl_smr_enter(fpl) ({ \
4059 struct cache_fpl *_fpl = (fpl); \
4060 MPASS(_fpl->in_smr == false); \
4061 vfs_smr_enter(); \
4062 _fpl->in_smr = true; \
4063 })
4064
4065 #define cache_fpl_smr_exit(fpl) ({ \
4066 struct cache_fpl *_fpl = (fpl); \
4067 MPASS(_fpl->in_smr == true); \
4068 vfs_smr_exit(); \
4069 _fpl->in_smr = false; \
4070 })
4071
4072 static int
4073 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
4074 {
4075
4076 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4077 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4078 ("%s: converting to abort from %d at %d, set at %d\n",
4079 __func__, fpl->status, line, fpl->line));
4080 }
4081 cache_fpl_smr_assert_not_entered(fpl);
4082 fpl->status = CACHE_FPL_STATUS_ABORTED;
4083 fpl->line = line;
4084 return (CACHE_FPL_FAILED);
4085 }
4086
4087 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__)
4088
4089 static int __noinline
4090 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
4091 {
4092 struct nameidata *ndp;
4093 struct componentname *cnp;
4094
4095 ndp = fpl->ndp;
4096 cnp = fpl->cnp;
4097
4098 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4099 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4100 ("%s: converting to abort from %d at %d, set at %d\n",
4101 __func__, fpl->status, line, fpl->line));
4102 }
4103 fpl->status = CACHE_FPL_STATUS_ABORTED;
4104 fpl->line = line;
4105 if (fpl->in_smr)
4106 cache_fpl_smr_exit(fpl);
4107 cache_fpl_restore_abort(fpl);
4108 /*
4109 * Resolving symlinks overwrites data passed by the caller.
4110 * Let namei know.
4111 */
4112 if (ndp->ni_loopcnt > 0) {
4113 fpl->status = CACHE_FPL_STATUS_DESTROYED;
4114 cache_fpl_cleanup_cnp(cnp);
4115 }
4116 return (CACHE_FPL_FAILED);
4117 }
4118
4119 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__)
4120
4121 static int __noinline
4122 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
4123 {
4124
4125 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4126 ("%s: setting to partial at %d, but already set to %d at %d\n",
4127 __func__, line, fpl->status, fpl->line));
4128 cache_fpl_smr_assert_entered(fpl);
4129 fpl->status = CACHE_FPL_STATUS_PARTIAL;
4130 fpl->line = line;
4131 return (cache_fplookup_partial_setup(fpl));
4132 }
4133
4134 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__)
4135
4136 static int
4137 cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
4138 {
4139
4140 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4141 ("%s: setting to handled at %d, but already set to %d at %d\n",
4142 __func__, line, fpl->status, fpl->line));
4143 cache_fpl_smr_assert_not_entered(fpl);
4144 fpl->status = CACHE_FPL_STATUS_HANDLED;
4145 fpl->line = line;
4146 return (0);
4147 }
4148
4149 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__)
4150
4151 static int
4152 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
4153 {
4154
4155 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4156 ("%s: setting to handled at %d, but already set to %d at %d\n",
4157 __func__, line, fpl->status, fpl->line));
4158 MPASS(error != 0);
4159 MPASS(error != CACHE_FPL_FAILED);
4160 cache_fpl_smr_assert_not_entered(fpl);
4161 fpl->status = CACHE_FPL_STATUS_HANDLED;
4162 fpl->line = line;
4163 fpl->dvp = NULL;
4164 fpl->tvp = NULL;
4165 fpl->savename = false;
4166 return (error);
4167 }
4168
4169 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__)
4170
4171 static bool
4172 cache_fpl_terminated(struct cache_fpl *fpl)
4173 {
4174
4175 return (fpl->status != CACHE_FPL_STATUS_UNSET);
4176 }
4177
4178 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
4179 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
4180 FAILIFEXISTS | FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | \
4181 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
4182
4183 #define CACHE_FPL_INTERNAL_CN_FLAGS \
4184 (ISDOTDOT | MAKEENTRY | ISLASTCN)
4185
4186 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
4187 "supported and internal flags overlap");
4188
4189 static bool
4190 cache_fpl_islastcn(struct nameidata *ndp)
4191 {
4192
4193 return (*ndp->ni_next == 0);
4194 }
4195
4196 static bool
4197 cache_fpl_istrailingslash(struct cache_fpl *fpl)
4198 {
4199
4200 return (*(fpl->nulchar - 1) == '/');
4201 }
4202
4203 static bool
4204 cache_fpl_isdotdot(struct componentname *cnp)
4205 {
4206
4207 if (cnp->cn_namelen == 2 &&
4208 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
4209 return (true);
4210 return (false);
4211 }
4212
4213 static bool
4214 cache_can_fplookup(struct cache_fpl *fpl)
4215 {
4216 struct nameidata *ndp;
4217 struct componentname *cnp;
4218 struct thread *td;
4219
4220 ndp = fpl->ndp;
4221 cnp = fpl->cnp;
4222 td = cnp->cn_thread;
4223
4224 if (!atomic_load_char(&cache_fast_lookup_enabled)) {
4225 cache_fpl_aborted_early(fpl);
4226 return (false);
4227 }
4228 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
4229 cache_fpl_aborted_early(fpl);
4230 return (false);
4231 }
4232 if (IN_CAPABILITY_MODE(td)) {
4233 cache_fpl_aborted_early(fpl);
4234 return (false);
4235 }
4236 if (AUDITING_TD(td)) {
4237 cache_fpl_aborted_early(fpl);
4238 return (false);
4239 }
4240 if (ndp->ni_startdir != NULL) {
4241 cache_fpl_aborted_early(fpl);
4242 return (false);
4243 }
4244 return (true);
4245 }
4246
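/*
 * Resolve the directory file descriptor (ndp->ni_dirfd) used as the lookup
 * starting point.  This is done under SMR without acquiring a reference;
 * fgetvp_lookup_smr also reports whether the descriptor was opened with
 * O_SEARCH, which is remembered in fpl->fsearch for later permission
 * handling (see cache_fplookup_failed_vexec).
 */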
4247 static int
4248 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
4249 {
4250 struct nameidata *ndp;
4251 int error;
4252 bool fsearch;
4253
4254 ndp = fpl->ndp;
4255 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
4256 if (__predict_false(error != 0)) {
4257 return (cache_fpl_aborted(fpl));
4258 }
4259 fpl->fsearch = fsearch;
4260 return (0);
4261 }
4262
4263 static int __noinline
4264 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
4265 uint32_t hash)
4266 {
4267 struct componentname *cnp;
4268 struct vnode *dvp;
4269
4270 cnp = fpl->cnp;
4271 dvp = fpl->dvp;
4272
4273 cache_fpl_smr_exit(fpl);
4274 if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
4275 return (cache_fpl_handled_error(fpl, ENOENT));
4276 else
4277 return (cache_fpl_aborted(fpl));
4278 }
4279
4280 /*
4281 * The target vnode is not supported, prepare for the slow path to take over.
4282 */
4283 static int __noinline
4284 cache_fplookup_partial_setup(struct cache_fpl *fpl)
4285 {
4286 struct nameidata *ndp;
4287 struct componentname *cnp;
4288 enum vgetstate dvs;
4289 struct vnode *dvp;
4290 struct pwd *pwd;
4291 seqc_t dvp_seqc;
4292
4293 ndp = fpl->ndp;
4294 cnp = fpl->cnp;
4295 pwd = *(fpl->pwd);
4296 dvp = fpl->dvp;
4297 dvp_seqc = fpl->dvp_seqc;
4298
4299 if (!pwd_hold_smr(pwd)) {
4300 return (cache_fpl_aborted(fpl));
4301 }
4302
4303 /*
4304 * Note that seqc is checked before the vnode is locked, so by
4305  * the time regular lookup gets to it, it may have moved.
4306 *
4307 * Ultimately this does not affect correctness, any lookup errors
4308 * are userspace racing with itself. It is guaranteed that any
4309 * path which ultimately gets found could also have been found
4310  * by regular lookup going all the way in the absence of concurrent
4311 * modifications.
4312 */
4313 dvs = vget_prep_smr(dvp);
4314 cache_fpl_smr_exit(fpl);
4315 if (__predict_false(dvs == VGET_NONE)) {
4316 pwd_drop(pwd);
4317 return (cache_fpl_aborted(fpl));
4318 }
4319
4320 vget_finish_ref(dvp, dvs);
4321 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4322 vrele(dvp);
4323 pwd_drop(pwd);
4324 return (cache_fpl_aborted(fpl));
4325 }
4326
4327 cache_fpl_restore_partial(fpl);
4328 #ifdef INVARIANTS
4329 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
4330 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
4331 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
4332 }
4333 #endif
4334
4335 ndp->ni_startdir = dvp;
4336 cnp->cn_flags |= MAKEENTRY;
4337 if (cache_fpl_islastcn(ndp))
4338 cnp->cn_flags |= ISLASTCN;
4339 if (cache_fpl_isdotdot(cnp))
4340 cnp->cn_flags |= ISDOTDOT;
4341
4342 /*
4343 * Skip potential extra slashes parsing did not take care of.
4344 * cache_fplookup_skip_slashes explains the mechanism.
4345 */
4346 if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4347 do {
4348 cnp->cn_nameptr++;
4349 cache_fpl_pathlen_dec(fpl);
4350 } while (*(cnp->cn_nameptr) == '/');
4351 }
4352
4353 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
4354 #ifdef INVARIANTS
4355 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
4356 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
4357 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
4358 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
4359 }
4360 #endif
4361 return (0);
4362 }
4363
4364 static int
4365 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
4366 {
4367 struct componentname *cnp;
4368 struct vnode *tvp;
4369 seqc_t tvp_seqc;
4370 int error, lkflags;
4371
4372 cnp = fpl->cnp;
4373 tvp = fpl->tvp;
4374 tvp_seqc = fpl->tvp_seqc;
4375
4376 if ((cnp->cn_flags & LOCKLEAF) != 0) {
4377 lkflags = LK_SHARED;
4378 if ((cnp->cn_flags & LOCKSHARED) == 0)
4379 lkflags = LK_EXCLUSIVE;
4380 error = vget_finish(tvp, lkflags, tvs);
4381 if (__predict_false(error != 0)) {
4382 return (cache_fpl_aborted(fpl));
4383 }
4384 } else {
4385 vget_finish_ref(tvp, tvs);
4386 }
4387
4388 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
4389 if ((cnp->cn_flags & LOCKLEAF) != 0)
4390 vput(tvp);
4391 else
4392 vrele(tvp);
4393 return (cache_fpl_aborted(fpl));
4394 }
4395
4396 return (cache_fpl_handled(fpl));
4397 }
4398
4399 /*
4400  * The caller may want to modify the state of the namecache.
4401 */
4402 static int __noinline
4403 cache_fplookup_final_modifying(struct cache_fpl *fpl)
4404 {
4405 struct nameidata *ndp;
4406 struct componentname *cnp;
4407 enum vgetstate dvs;
4408 struct vnode *dvp, *tvp;
4409 struct mount *mp;
4410 seqc_t dvp_seqc;
4411 int error;
4412 bool docache;
4413
4414 ndp = fpl->ndp;
4415 cnp = fpl->cnp;
4416 dvp = fpl->dvp;
4417 dvp_seqc = fpl->dvp_seqc;
4418
4419 MPASS(*(cnp->cn_nameptr) != '/');
4420 MPASS(cache_fpl_islastcn(ndp));
4421 if ((cnp->cn_flags & LOCKPARENT) == 0)
4422 MPASS((cnp->cn_flags & WANTPARENT) != 0);
4423 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
4424 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
4425 cnp->cn_nameiop == RENAME);
4426 MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4427 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4428
4429 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4430 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
4431 docache = false;
4432
4433 /*
4434  * Regular lookup nullifies the slash, which we don't do here.
4435 * Don't take chances with filesystem routines seeing it for
4436 * the last entry.
4437 */
4438 if (cache_fpl_istrailingslash(fpl)) {
4439 return (cache_fpl_partial(fpl));
4440 }
4441
4442 mp = atomic_load_ptr(&dvp->v_mount);
4443 if (__predict_false(mp == NULL)) {
4444 return (cache_fpl_aborted(fpl));
4445 }
4446
4447 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
4448 cache_fpl_smr_exit(fpl);
4449 /*
4450  * The original code does not check for CREATE, which
4451  * might be a bug. For now let the old lookup decide.
4452 */
4453 if (cnp->cn_nameiop == CREATE) {
4454 return (cache_fpl_aborted(fpl));
4455 }
4456 return (cache_fpl_handled_error(fpl, EROFS));
4457 }
4458
4459 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
4460 cache_fpl_smr_exit(fpl);
4461 return (cache_fpl_handled_error(fpl, EEXIST));
4462 }
4463
4464 /*
4465 * Secure access to dvp; check cache_fplookup_partial_setup for
4466 * reasoning.
4467 *
4468 * XXX At least UFS requires its lookup routine to be called for
4469 * the last path component, which leads to some level of complication
4470 * and inefficiency:
4471 * - the target routine always locks the target vnode, but our caller
4472 * may not need it locked
4473 * - some of the VOP machinery asserts that the parent is locked, which
4474  *   once more may not be required
4475 *
4476 * TODO: add a flag for filesystems which don't need this.
4477 */
4478 dvs = vget_prep_smr(dvp);
4479 cache_fpl_smr_exit(fpl);
4480 if (__predict_false(dvs == VGET_NONE)) {
4481 return (cache_fpl_aborted(fpl));
4482 }
4483
4484 vget_finish_ref(dvp, dvs);
4485 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4486 vrele(dvp);
4487 return (cache_fpl_aborted(fpl));
4488 }
4489
4490 error = vn_lock(dvp, LK_EXCLUSIVE);
4491 if (__predict_false(error != 0)) {
4492 vrele(dvp);
4493 return (cache_fpl_aborted(fpl));
4494 }
4495
4496 tvp = NULL;
4497 cnp->cn_flags |= ISLASTCN;
4498 if (docache)
4499 cnp->cn_flags |= MAKEENTRY;
4500 if (cache_fpl_isdotdot(cnp))
4501 cnp->cn_flags |= ISDOTDOT;
4502 cnp->cn_lkflags = LK_EXCLUSIVE;
4503 error = VOP_LOOKUP(dvp, &tvp, cnp);
4504 switch (error) {
4505 case EJUSTRETURN:
4506 case 0:
4507 break;
4508 case ENOTDIR:
4509 case ENOENT:
4510 vput(dvp);
4511 return (cache_fpl_handled_error(fpl, error));
4512 default:
4513 vput(dvp);
4514 return (cache_fpl_aborted(fpl));
4515 }
4516
4517 fpl->tvp = tvp;
4518 fpl->savename = (cnp->cn_flags & SAVENAME) != 0;
4519
4520 if (tvp == NULL) {
4521 if ((cnp->cn_flags & SAVESTART) != 0) {
4522 ndp->ni_startdir = dvp;
4523 vrefact(ndp->ni_startdir);
4524 cnp->cn_flags |= SAVENAME;
4525 fpl->savename = true;
4526 }
4527 MPASS(error == EJUSTRETURN);
4528 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4529 VOP_UNLOCK(dvp);
4530 }
4531 return (cache_fpl_handled(fpl));
4532 }
4533
4534 /*
4535 * There are very hairy corner cases concerning various flag combinations
4536 * and locking state. In particular here we only hold one lock instead of
4537 * two.
4538 *
4539 * Skip the complexity as it is of no significance for normal workloads.
4540 */
4541 if (__predict_false(tvp == dvp)) {
4542 vput(dvp);
4543 vrele(tvp);
4544 return (cache_fpl_aborted(fpl));
4545 }
4546
4547 /*
4548 * If they want the symlink itself we are fine, but if they want to
4549  * follow it, regular lookup has to be engaged.
4550 */
4551 if (tvp->v_type == VLNK) {
4552 if ((cnp->cn_flags & FOLLOW) != 0) {
4553 vput(dvp);
4554 vput(tvp);
4555 return (cache_fpl_aborted(fpl));
4556 }
4557 }
4558
4559 /*
4560 * Since we expect this to be the terminal vnode it should almost never
4561 * be a mount point.
4562 */
4563 if (__predict_false(cache_fplookup_is_mp(fpl))) {
4564 vput(dvp);
4565 vput(tvp);
4566 return (cache_fpl_aborted(fpl));
4567 }
4568
4569 if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
4570 vput(dvp);
4571 vput(tvp);
4572 return (cache_fpl_handled_error(fpl, EEXIST));
4573 }
4574
4575 if ((cnp->cn_flags & LOCKLEAF) == 0) {
4576 VOP_UNLOCK(tvp);
4577 }
4578
4579 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4580 VOP_UNLOCK(dvp);
4581 }
4582
4583 if ((cnp->cn_flags & SAVESTART) != 0) {
4584 ndp->ni_startdir = dvp;
4585 vrefact(ndp->ni_startdir);
4586 cnp->cn_flags |= SAVENAME;
4587 fpl->savename = true;
4588 }
4589
4590 return (cache_fpl_handled(fpl));
4591 }
4592
4593 static int __noinline
4594 cache_fplookup_modifying(struct cache_fpl *fpl)
4595 {
4596 struct nameidata *ndp;
4597
4598 ndp = fpl->ndp;
4599
4600 if (!cache_fpl_islastcn(ndp)) {
4601 return (cache_fpl_partial(fpl));
4602 }
4603 return (cache_fplookup_final_modifying(fpl));
4604 }
4605
4606 static int __noinline
4607 cache_fplookup_final_withparent(struct cache_fpl *fpl)
4608 {
4609 struct componentname *cnp;
4610 enum vgetstate dvs, tvs;
4611 struct vnode *dvp, *tvp;
4612 seqc_t dvp_seqc;
4613 int error;
4614
4615 cnp = fpl->cnp;
4616 dvp = fpl->dvp;
4617 dvp_seqc = fpl->dvp_seqc;
4618 tvp = fpl->tvp;
4619
4620 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
4621
4622 /*
4623 * This is less efficient than it can be for simplicity.
4624 */
4625 dvs = vget_prep_smr(dvp);
4626 if (__predict_false(dvs == VGET_NONE)) {
4627 return (cache_fpl_aborted(fpl));
4628 }
4629 tvs = vget_prep_smr(tvp);
4630 if (__predict_false(tvs == VGET_NONE)) {
4631 cache_fpl_smr_exit(fpl);
4632 vget_abort(dvp, dvs);
4633 return (cache_fpl_aborted(fpl));
4634 }
4635
4636 cache_fpl_smr_exit(fpl);
4637
4638 if ((cnp->cn_flags & LOCKPARENT) != 0) {
4639 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
4640 if (__predict_false(error != 0)) {
4641 vget_abort(tvp, tvs);
4642 return (cache_fpl_aborted(fpl));
4643 }
4644 } else {
4645 vget_finish_ref(dvp, dvs);
4646 }
4647
4648 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4649 vget_abort(tvp, tvs);
4650 if ((cnp->cn_flags & LOCKPARENT) != 0)
4651 vput(dvp);
4652 else
4653 vrele(dvp);
4654 return (cache_fpl_aborted(fpl));
4655 }
4656
4657 error = cache_fplookup_final_child(fpl, tvs);
4658 if (__predict_false(error != 0)) {
4659 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED ||
4660 fpl->status == CACHE_FPL_STATUS_DESTROYED);
4661 if ((cnp->cn_flags & LOCKPARENT) != 0)
4662 vput(dvp);
4663 else
4664 vrele(dvp);
4665 return (error);
4666 }
4667
4668 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
4669 return (0);
4670 }
4671
4672 static int
4673 cache_fplookup_final(struct cache_fpl *fpl)
4674 {
4675 struct componentname *cnp;
4676 enum vgetstate tvs;
4677 struct vnode *dvp, *tvp;
4678 seqc_t dvp_seqc;
4679
4680 cnp = fpl->cnp;
4681 dvp = fpl->dvp;
4682 dvp_seqc = fpl->dvp_seqc;
4683 tvp = fpl->tvp;
4684
4685 MPASS(*(cnp->cn_nameptr) != '/');
4686
4687 if (cnp->cn_nameiop != LOOKUP) {
4688 return (cache_fplookup_final_modifying(fpl));
4689 }
4690
4691 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
4692 return (cache_fplookup_final_withparent(fpl));
4693
4694 tvs = vget_prep_smr(tvp);
4695 if (__predict_false(tvs == VGET_NONE)) {
4696 return (cache_fpl_partial(fpl));
4697 }
4698
4699 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4700 cache_fpl_smr_exit(fpl);
4701 vget_abort(tvp, tvs);
4702 return (cache_fpl_aborted(fpl));
4703 }
4704
4705 cache_fpl_smr_exit(fpl);
4706 return (cache_fplookup_final_child(fpl, tvs));
4707 }
4708
4709 /*
4710 * Comment from locked lookup:
4711 * Check for degenerate name (e.g. / or "") which is a way of talking about a
4712 * directory, e.g. like "/." or ".".
4713 */
4714 static int __noinline
4715 cache_fplookup_degenerate(struct cache_fpl *fpl)
4716 {
4717 struct componentname *cnp;
4718 struct vnode *dvp;
4719 enum vgetstate dvs;
4720 int error, lkflags;
4721 #ifdef INVARIANTS
4722 char *cp;
4723 #endif
4724
4725 fpl->tvp = fpl->dvp;
4726 fpl->tvp_seqc = fpl->dvp_seqc;
4727
4728 cnp = fpl->cnp;
4729 dvp = fpl->dvp;
4730
4731 #ifdef INVARIANTS
4732 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) {
4733 KASSERT(*cp == '/',
4734 ("%s: encountered non-slash; string [%s]\n", __func__,
4735 cnp->cn_pnbuf));
4736 }
4737 #endif
4738
4739 if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
4740 cache_fpl_smr_exit(fpl);
4741 return (cache_fpl_handled_error(fpl, EISDIR));
4742 }
4743
4744 MPASS((cnp->cn_flags & SAVESTART) == 0);
4745
4746 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
4747 return (cache_fplookup_final_withparent(fpl));
4748 }
4749
4750 dvs = vget_prep_smr(dvp);
4751 cache_fpl_smr_exit(fpl);
4752 if (__predict_false(dvs == VGET_NONE)) {
4753 return (cache_fpl_aborted(fpl));
4754 }
4755
4756 if ((cnp->cn_flags & LOCKLEAF) != 0) {
4757 lkflags = LK_SHARED;
4758 if ((cnp->cn_flags & LOCKSHARED) == 0)
4759 lkflags = LK_EXCLUSIVE;
4760 error = vget_finish(dvp, lkflags, dvs);
4761 if (__predict_false(error != 0)) {
4762 return (cache_fpl_aborted(fpl));
4763 }
4764 } else {
4765 vget_finish_ref(dvp, dvs);
4766 }
4767 return (cache_fpl_handled(fpl));
4768 }
4769
4770 static int __noinline
4771 cache_fplookup_noentry(struct cache_fpl *fpl)
4772 {
4773 struct nameidata *ndp;
4774 struct componentname *cnp;
4775 enum vgetstate dvs;
4776 struct vnode *dvp, *tvp;
4777 seqc_t dvp_seqc;
4778 int error;
4779 bool docache;
4780
4781 ndp = fpl->ndp;
4782 cnp = fpl->cnp;
4783 dvp = fpl->dvp;
4784 dvp_seqc = fpl->dvp_seqc;
4785
4786 MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4787 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4788 MPASS(!cache_fpl_isdotdot(cnp));
4789
4790 /*
4791 * Hack: delayed name len checking.
4792 */
4793 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4794 cache_fpl_smr_exit(fpl);
4795 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
4796 }
4797
4798 if (cnp->cn_nameptr[0] == '/') {
4799 return (cache_fplookup_skip_slashes(fpl));
4800 }
4801
4802 if (cnp->cn_nameptr[0] == '\0') {
4803 if (fpl->tvp == NULL) {
4804 return (cache_fplookup_degenerate(fpl));
4805 }
4806 return (cache_fplookup_trailingslash(fpl));
4807 }
4808
4809 if (cnp->cn_nameiop != LOOKUP) {
4810 fpl->tvp = NULL;
4811 return (cache_fplookup_modifying(fpl));
4812 }
4813
4814 MPASS((cnp->cn_flags & SAVESTART) == 0);
4815
4816 /*
4817 * Only try to fill in the component if it is the last one,
4818  * otherwise not only may there be several to handle, but the
4819 * walk may be complicated.
4820 */
4821 if (!cache_fpl_islastcn(ndp)) {
4822 return (cache_fpl_partial(fpl));
4823 }
4824
4825 /*
4826  * Regular lookup nullifies the slash, which we don't do here.
4827 * Don't take chances with filesystem routines seeing it for
4828 * the last entry.
4829 */
4830 if (cache_fpl_istrailingslash(fpl)) {
4831 return (cache_fpl_partial(fpl));
4832 }
4833
4834 /*
4835 * Secure access to dvp; check cache_fplookup_partial_setup for
4836 * reasoning.
4837 */
4838 dvs = vget_prep_smr(dvp);
4839 cache_fpl_smr_exit(fpl);
4840 if (__predict_false(dvs == VGET_NONE)) {
4841 return (cache_fpl_aborted(fpl));
4842 }
4843
4844 vget_finish_ref(dvp, dvs);
4845 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4846 vrele(dvp);
4847 return (cache_fpl_aborted(fpl));
4848 }
4849
4850 error = vn_lock(dvp, LK_SHARED);
4851 if (__predict_false(error != 0)) {
4852 vrele(dvp);
4853 return (cache_fpl_aborted(fpl));
4854 }
4855
4856 tvp = NULL;
4857 /*
4858 * TODO: provide variants which don't require locking either vnode.
4859 */
4860 cnp->cn_flags |= ISLASTCN;
4861 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4862 if (docache)
4863 cnp->cn_flags |= MAKEENTRY;
4864 cnp->cn_lkflags = LK_SHARED;
4865 if ((cnp->cn_flags & LOCKSHARED) == 0) {
4866 cnp->cn_lkflags = LK_EXCLUSIVE;
4867 }
4868 error = VOP_LOOKUP(dvp, &tvp, cnp);
4869 switch (error) {
4870 case EJUSTRETURN:
4871 case 0:
4872 break;
4873 case ENOTDIR:
4874 case ENOENT:
4875 vput(dvp);
4876 return (cache_fpl_handled_error(fpl, error));
4877 default:
4878 vput(dvp);
4879 return (cache_fpl_aborted(fpl));
4880 }
4881
4882 fpl->tvp = tvp;
4883 if (!fpl->savename) {
4884 MPASS((cnp->cn_flags & SAVENAME) == 0);
4885 }
4886
4887 if (tvp == NULL) {
4888 MPASS(error == EJUSTRETURN);
4889 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
4890 vput(dvp);
4891 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
4892 VOP_UNLOCK(dvp);
4893 }
4894 return (cache_fpl_handled(fpl));
4895 }
4896
4897 if (tvp->v_type == VLNK) {
4898 if ((cnp->cn_flags & FOLLOW) != 0) {
4899 vput(dvp);
4900 vput(tvp);
4901 return (cache_fpl_aborted(fpl));
4902 }
4903 }
4904
4905 if (__predict_false(cache_fplookup_is_mp(fpl))) {
4906 vput(dvp);
4907 vput(tvp);
4908 return (cache_fpl_aborted(fpl));
4909 }
4910
4911 if ((cnp->cn_flags & LOCKLEAF) == 0) {
4912 VOP_UNLOCK(tvp);
4913 }
4914
4915 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
4916 vput(dvp);
4917 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
4918 VOP_UNLOCK(dvp);
4919 }
4920 return (cache_fpl_handled(fpl));
4921 }
4922
4923 static int __noinline
4924 cache_fplookup_dot(struct cache_fpl *fpl)
4925 {
4926 int error;
4927
4928 MPASS(!seqc_in_modify(fpl->dvp_seqc));
4929 /*
4930 * Just re-assign the value. seqc will be checked later for the first
4931 * non-dot path component in line and/or before deciding to return the
4932 * vnode.
4933 */
4934 fpl->tvp = fpl->dvp;
4935 fpl->tvp_seqc = fpl->dvp_seqc;
4936
4937 counter_u64_add(dothits, 1);
4938 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
4939
4940 error = 0;
4941 if (cache_fplookup_is_mp(fpl)) {
4942 error = cache_fplookup_cross_mount(fpl);
4943 }
4944 return (error);
4945 }
4946
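/*
 * Handle the ".." component.  Lookups pinned at a root (process root, jail
 * root, top directory or the global root vnode) stay at the current
 * directory.  Otherwise the cached dotdot entry hanging off v_cache_dd
 * provides the parent; any inconsistency detected via sequence counters or
 * entry invalidation causes a fallback to the slow path.
 */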
4947 static int __noinline
4948 cache_fplookup_dotdot(struct cache_fpl *fpl)
4949 {
4950 struct nameidata *ndp;
4951 struct componentname *cnp;
4952 struct namecache *ncp;
4953 struct vnode *dvp;
4954 struct prison *pr;
4955 u_char nc_flag;
4956
4957 ndp = fpl->ndp;
4958 cnp = fpl->cnp;
4959 dvp = fpl->dvp;
4960
4961 MPASS(cache_fpl_isdotdot(cnp));
4962
4963 /*
4964 * XXX this is racy the same way regular lookup is
4965 */
4966 for (pr = cnp->cn_cred->cr_prison; pr != NULL;
4967 pr = pr->pr_parent)
4968 if (dvp == pr->pr_root)
4969 break;
4970
4971 if (dvp == ndp->ni_rootdir ||
4972 dvp == ndp->ni_topdir ||
4973 dvp == rootvnode ||
4974 pr != NULL) {
4975 fpl->tvp = dvp;
4976 fpl->tvp_seqc = vn_seqc_read_any(dvp);
4977 if (seqc_in_modify(fpl->tvp_seqc)) {
4978 return (cache_fpl_aborted(fpl));
4979 }
4980 return (0);
4981 }
4982
4983 if ((dvp->v_vflag & VV_ROOT) != 0) {
4984 /*
4985 * TODO
4986 * The opposite of climb mount is needed here.
4987 */
4988 return (cache_fpl_partial(fpl));
4989 }
4990
4991 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
4992 if (ncp == NULL) {
4993 return (cache_fpl_aborted(fpl));
4994 }
4995
4996 nc_flag = atomic_load_char(&ncp->nc_flag);
4997 if ((nc_flag & NCF_ISDOTDOT) != 0) {
4998 if ((nc_flag & NCF_NEGATIVE) != 0)
4999 return (cache_fpl_aborted(fpl));
5000 fpl->tvp = ncp->nc_vp;
5001 } else {
5002 fpl->tvp = ncp->nc_dvp;
5003 }
5004
5005 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
5006 if (seqc_in_modify(fpl->tvp_seqc)) {
5007 return (cache_fpl_partial(fpl));
5008 }
5009
5010 /*
5011 * Acquire fence provided by vn_seqc_read_any above.
5012 */
5013 if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
5014 return (cache_fpl_aborted(fpl));
5015 }
5016
5017 if (!cache_ncp_canuse(ncp)) {
5018 return (cache_fpl_aborted(fpl));
5019 }
5020
5021 counter_u64_add(dotdothits, 1);
5022 return (0);
5023 }
5024
5025 static int __noinline
5026 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
5027 {
5028 u_char nc_flag;
5029 bool neg_promote;
5030
5031 nc_flag = atomic_load_char(&ncp->nc_flag);
5032 MPASS((nc_flag & NCF_NEGATIVE) != 0);
5033 /*
5034 * If they want to create an entry we need to replace this one.
5035 */
5036 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
5037 fpl->tvp = NULL;
5038 return (cache_fplookup_modifying(fpl));
5039 }
5040 neg_promote = cache_neg_hit_prep(ncp);
5041 if (!cache_fpl_neg_ncp_canuse(ncp)) {
5042 cache_neg_hit_abort(ncp);
5043 return (cache_fpl_partial(fpl));
5044 }
5045 if (neg_promote) {
5046 return (cache_fplookup_negative_promote(fpl, ncp, hash));
5047 }
5048 cache_neg_hit_finish(ncp);
5049 cache_fpl_smr_exit(fpl);
5050 return (cache_fpl_handled_error(fpl, ENOENT));
5051 }
5052
5053 /*
5054 * Resolve a symlink. Called by filesystem-specific routines.
5055 *
5056 * Code flow is:
5057 * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
5058 */
5059 int
5060 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
5061 {
5062 struct nameidata *ndp;
5063 struct componentname *cnp;
5064 size_t adjust;
5065
5066 ndp = fpl->ndp;
5067 cnp = fpl->cnp;
5068
5069 if (__predict_false(len == 0)) {
5070 return (ENOENT);
5071 }
5072
5073 if (__predict_false(len > MAXPATHLEN - 2)) {
5074 if (cache_fpl_istrailingslash(fpl)) {
5075 return (EAGAIN);
5076 }
5077 }
5078
5079 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
5080 #ifdef INVARIANTS
5081 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
5082 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5083 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5084 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5085 }
5086 #endif
5087
5088 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
5089 return (ENAMETOOLONG);
5090 }
5091
5092 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
5093 return (ELOOP);
5094 }
5095
5096 adjust = len;
5097 if (ndp->ni_pathlen > 1) {
5098 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
5099 } else {
5100 if (cache_fpl_istrailingslash(fpl)) {
5101 adjust = len + 1;
5102 cnp->cn_pnbuf[len] = '/';
5103 cnp->cn_pnbuf[len + 1] = '\0';
5104 } else {
5105 cnp->cn_pnbuf[len] = '\0';
5106 }
5107 }
5108 bcopy(string, cnp->cn_pnbuf, len);
5109
5110 ndp->ni_pathlen += adjust;
5111 cache_fpl_pathlen_add(fpl, adjust);
5112 cnp->cn_nameptr = cnp->cn_pnbuf;
5113 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
5114 fpl->tvp = NULL;
5115 return (0);
5116 }
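
/*
 * Illustrative sketch of the filesystem side of the flow above, for a
 * hypothetical "examplefs" which keeps short link targets inline in its
 * in-memory node.  The examplefs_node layout and field names are assumed,
 * as is the standard vop_fplookup_symlink_args layout; a real
 * VOP_FPLOOKUP_SYMLINK implementation must only touch data which is safe to
 * read under vfs_smr and return EAGAIN to punt to the locked path.
 */
static int
examplefs_fplookup_symlink(struct vop_fplookup_symlink_args *ap)
{
	struct examplefs_node *np;

	/* ->v_data may be concurrently cleared by reclaim; load it once. */
	np = atomic_load_ptr(&ap->a_vp->v_data);
	if (__predict_false(np == NULL || !np->n_link_inline))
		return (EAGAIN);
	return (cache_symlink_resolve(ap->a_fpl, np->n_linktarget, np->n_linklen));
}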
5117
5118 static int __noinline
5119 cache_fplookup_symlink(struct cache_fpl *fpl)
5120 {
5121 struct mount *mp;
5122 struct nameidata *ndp;
5123 struct componentname *cnp;
5124 struct vnode *dvp, *tvp;
5125 int error;
5126
5127 ndp = fpl->ndp;
5128 cnp = fpl->cnp;
5129 dvp = fpl->dvp;
5130 tvp = fpl->tvp;
5131
5132 if (cache_fpl_islastcn(ndp)) {
5133 if ((cnp->cn_flags & FOLLOW) == 0) {
5134 return (cache_fplookup_final(fpl));
5135 }
5136 }
5137
5138 mp = atomic_load_ptr(&dvp->v_mount);
5139 if (__predict_false(mp == NULL)) {
5140 return (cache_fpl_aborted(fpl));
5141 }
5142
5143 /*
5144 * Note this check races against setting the flag just like regular
5145 * lookup.
5146 */
5147 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
5148 cache_fpl_smr_exit(fpl);
5149 return (cache_fpl_handled_error(fpl, EACCES));
5150 }
5151
5152 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
5153 if (__predict_false(error != 0)) {
5154 switch (error) {
5155 case EAGAIN:
5156 return (cache_fpl_partial(fpl));
5157 case ENOENT:
5158 case ENAMETOOLONG:
5159 case ELOOP:
5160 cache_fpl_smr_exit(fpl);
5161 return (cache_fpl_handled_error(fpl, error));
5162 default:
5163 return (cache_fpl_aborted(fpl));
5164 }
5165 }
5166
5167 if (*(cnp->cn_nameptr) == '/') {
5168 fpl->dvp = cache_fpl_handle_root(fpl);
5169 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5170 if (seqc_in_modify(fpl->dvp_seqc)) {
5171 return (cache_fpl_aborted(fpl));
5172 }
5173 /*
5174 * The main loop assumes that ->dvp points to a vnode belonging
5175 * to a filesystem which can do lockless lookup, but the absolute
5176 * symlink can be wandering off to one which does not.
5177 */
5178 mp = atomic_load_ptr(&fpl->dvp->v_mount);
5179 if (__predict_false(mp == NULL)) {
5180 return (cache_fpl_aborted(fpl));
5181 }
5182 if (!cache_fplookup_mp_supported(mp)) {
5183 cache_fpl_checkpoint(fpl);
5184 return (cache_fpl_partial(fpl));
5185 }
5186 }
5187 return (0);
5188 }
5189
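/*
 * Look up the current path component.  "." and ".." get dedicated handlers;
 * everything else is searched for in the hash chain computed during parsing.
 * A missing or negative entry is dispatched to the relevant handler,
 * otherwise the target vnode and its sequence counter are recorded and any
 * mount point crossing is taken care of.
 */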
5190 static int
5191 cache_fplookup_next(struct cache_fpl *fpl)
5192 {
5193 struct componentname *cnp;
5194 struct namecache *ncp;
5195 struct vnode *dvp, *tvp;
5196 u_char nc_flag;
5197 uint32_t hash;
5198 int error;
5199
5200 cnp = fpl->cnp;
5201 dvp = fpl->dvp;
5202 hash = fpl->hash;
5203
5204 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
5205 if (cnp->cn_namelen == 1) {
5206 return (cache_fplookup_dot(fpl));
5207 }
5208 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
5209 return (cache_fplookup_dotdot(fpl));
5210 }
5211 }
5212
5213 MPASS(!cache_fpl_isdotdot(cnp));
5214
5215 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
5216 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
5217 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
5218 break;
5219 }
5220
5221 if (__predict_false(ncp == NULL)) {
5222 return (cache_fplookup_noentry(fpl));
5223 }
5224
5225 tvp = atomic_load_ptr(&ncp->nc_vp);
5226 nc_flag = atomic_load_char(&ncp->nc_flag);
5227 if ((nc_flag & NCF_NEGATIVE) != 0) {
5228 return (cache_fplookup_neg(fpl, ncp, hash));
5229 }
5230
5231 if (!cache_ncp_canuse(ncp)) {
5232 return (cache_fpl_partial(fpl));
5233 }
5234
5235 fpl->tvp = tvp;
5236 fpl->tvp_seqc = vn_seqc_read_any(tvp);
5237 if (seqc_in_modify(fpl->tvp_seqc)) {
5238 return (cache_fpl_partial(fpl));
5239 }
5240
5241 counter_u64_add(numposhits, 1);
5242 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
5243
5244 error = 0;
5245 if (cache_fplookup_is_mp(fpl)) {
5246 error = cache_fplookup_cross_mount(fpl);
5247 }
5248 return (error);
5249 }
5250
5251 static bool
5252 cache_fplookup_mp_supported(struct mount *mp)
5253 {
5254
5255 MPASS(mp != NULL);
5256 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
5257 return (false);
5258 return (true);
5259 }
5260
5261 /*
5262 * Walk up the mount stack (if any).
5263 *
5264 * Correctness is provided in the following ways:
5265 * - all vnodes are protected from freeing with SMR
5266 * - struct mount objects are type stable making them always safe to access
5267 * - stability of the particular mount is provided by busying it
5268 * - relationship between the vnode which is mounted on and the mount is
5269 * verified with the vnode sequence counter after busying
5270 * - association between root vnode of the mount and the mount is protected
5271 * by busy
5272 *
5273 * From that point on we can read the sequence counter of the root vnode
5274 * and get the next mount on the stack (if any) using the same protection.
5275 *
5276 * By the end of successful walk we are guaranteed the reached state was
5277 * indeed present at least at some point which matches the regular lookup.
5278 */
5279 static int __noinline
5280 cache_fplookup_climb_mount(struct cache_fpl *fpl)
5281 {
5282 struct mount *mp, *prev_mp;
5283 struct mount_pcpu *mpcpu, *prev_mpcpu;
5284 struct vnode *vp;
5285 seqc_t vp_seqc;
5286
5287 vp = fpl->tvp;
5288 vp_seqc = fpl->tvp_seqc;
5289
5290 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
5291 mp = atomic_load_ptr(&vp->v_mountedhere);
5292 if (__predict_false(mp == NULL)) {
5293 return (0);
5294 }
5295
5296 prev_mp = NULL;
5297 for (;;) {
5298 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5299 if (prev_mp != NULL)
5300 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5301 return (cache_fpl_partial(fpl));
5302 }
5303 if (prev_mp != NULL)
5304 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5305 if (!vn_seqc_consistent(vp, vp_seqc)) {
5306 vfs_op_thread_exit_crit(mp, mpcpu);
5307 return (cache_fpl_partial(fpl));
5308 }
5309 if (!cache_fplookup_mp_supported(mp)) {
5310 vfs_op_thread_exit_crit(mp, mpcpu);
5311 return (cache_fpl_partial(fpl));
5312 }
5313 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5314 if (vp == NULL) {
5315 vfs_op_thread_exit_crit(mp, mpcpu);
5316 return (cache_fpl_partial(fpl));
5317 }
5318 vp_seqc = vn_seqc_read_any(vp);
5319 if (seqc_in_modify(vp_seqc)) {
5320 vfs_op_thread_exit_crit(mp, mpcpu);
5321 return (cache_fpl_partial(fpl));
5322 }
5323 prev_mp = mp;
5324 prev_mpcpu = mpcpu;
5325 mp = atomic_load_ptr(&vp->v_mountedhere);
5326 if (mp == NULL)
5327 break;
5328 }
5329
5330 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5331 fpl->tvp = vp;
5332 fpl->tvp_seqc = vp_seqc;
5333 return (0);
5334 }
5335
5336 static int __noinline
5337 cache_fplookup_cross_mount(struct cache_fpl *fpl)
5338 {
5339 struct mount *mp;
5340 struct mount_pcpu *mpcpu;
5341 struct vnode *vp;
5342 seqc_t vp_seqc;
5343
5344 vp = fpl->tvp;
5345 vp_seqc = fpl->tvp_seqc;
5346
5347 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
5348 mp = atomic_load_ptr(&vp->v_mountedhere);
5349 if (__predict_false(mp == NULL)) {
5350 return (0);
5351 }
5352
5353 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5354 return (cache_fpl_partial(fpl));
5355 }
5356 if (!vn_seqc_consistent(vp, vp_seqc)) {
5357 vfs_op_thread_exit_crit(mp, mpcpu);
5358 return (cache_fpl_partial(fpl));
5359 }
5360 if (!cache_fplookup_mp_supported(mp)) {
5361 vfs_op_thread_exit_crit(mp, mpcpu);
5362 return (cache_fpl_partial(fpl));
5363 }
5364 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5365 if (__predict_false(vp == NULL)) {
5366 vfs_op_thread_exit_crit(mp, mpcpu);
5367 return (cache_fpl_partial(fpl));
5368 }
5369 vp_seqc = vn_seqc_read_any(vp);
5370 vfs_op_thread_exit_crit(mp, mpcpu);
5371 if (seqc_in_modify(vp_seqc)) {
5372 return (cache_fpl_partial(fpl));
5373 }
5374 mp = atomic_load_ptr(&vp->v_mountedhere);
5375 if (__predict_false(mp != NULL)) {
5376 /*
5377 * There are possibly more mount points on top.
5378 * Normally this does not happen so for simplicity just start
5379 * over.
5380 */
5381 return (cache_fplookup_climb_mount(fpl));
5382 }
5383
5384 fpl->tvp = vp;
5385 fpl->tvp_seqc = vp_seqc;
5386 return (0);
5387 }
5388
5389 /*
5390 * Check if a vnode is mounted on.
5391 */
5392 static bool
5393 cache_fplookup_is_mp(struct cache_fpl *fpl)
5394 {
5395 struct vnode *vp;
5396
5397 vp = fpl->tvp;
5398 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0);
5399 }
5400
5401 /*
5402 * Parse the path.
5403 *
5404 * The code was originally copy-pasted from regular lookup and despite
5405  * cleanups leaves performance on the table. Any modifications here
5406  * must take into account that in case of fallback the resulting
5407 * nameidata state has to be compatible with the original.
5408 */
5409
5410 /*
5411 * Debug ni_pathlen tracking.
5412 */
5413 #ifdef INVARIANTS
5414 static void
5415 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5416 {
5417
5418 fpl->debug.ni_pathlen += n;
5419 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5420 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5421 }
5422
5423 static void
5424 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5425 {
5426
5427 fpl->debug.ni_pathlen -= n;
5428 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5429 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5430 }
5431
5432 static void
5433 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5434 {
5435
5436 cache_fpl_pathlen_add(fpl, 1);
5437 }
5438
5439 static void
5440 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5441 {
5442
5443 cache_fpl_pathlen_sub(fpl, 1);
5444 }
5445 #else
5446 static void
5447 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5448 {
5449 }
5450
5451 static void
5452 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5453 {
5454 }
5455
5456 static void
5457 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5458 {
5459 }
5460
5461 static void
5462 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5463 {
5464 }
5465 #endif
5466
5467 static void
5468 cache_fplookup_parse(struct cache_fpl *fpl)
5469 {
5470 struct nameidata *ndp;
5471 struct componentname *cnp;
5472 struct vnode *dvp;
5473 char *cp;
5474 uint32_t hash;
5475
5476 ndp = fpl->ndp;
5477 cnp = fpl->cnp;
5478 dvp = fpl->dvp;
5479
5480 /*
5481 * Find the end of this path component, it is either / or nul.
5482 *
5483 * Store / as a temporary sentinel so that we only have one character
5484  * to test for. Pathnames tend to be short so this should not result
5485  * in cache misses.
5486 *
5487 * TODO: fix this to be word-sized.
5488 */
5489 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
5490 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
5491 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
5492 fpl->nulchar, cnp->cn_pnbuf));
5493 KASSERT(*fpl->nulchar == '\0',
5494 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
5495 cnp->cn_pnbuf));
5496 hash = cache_get_hash_iter_start(dvp);
5497 *fpl->nulchar = '/';
5498 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
5499 KASSERT(*cp != '\0',
5500 ("%s: encountered unexpected nul; string [%s]\n", __func__,
5501 cnp->cn_nameptr));
5502 hash = cache_get_hash_iter(*cp, hash);
5503 continue;
5504 }
5505 *fpl->nulchar = '\0';
5506 fpl->hash = cache_get_hash_iter_finish(hash);
5507
5508 cnp->cn_namelen = cp - cnp->cn_nameptr;
5509 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
5510
5511 #ifdef INVARIANTS
5512 /*
5513 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
5514 * we are going to fail this lookup with ENAMETOOLONG (see below).
5515 */
5516 if (cnp->cn_namelen <= NAME_MAX) {
5517 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
5518 panic("%s: mismatched hash for [%s] len %ld", __func__,
5519 cnp->cn_nameptr, cnp->cn_namelen);
5520 }
5521 }
5522 #endif
5523
5524 /*
5525 * Hack: we have to check if the found path component's length exceeds
5526  * NAME_MAX. However, the condition is very rarely true and the check can
5527 * be elided in the common case -- if an entry was found in the cache,
5528 * then it could not have been too long to begin with.
5529 */
5530 ndp->ni_next = cp;
5531 }
5532
5533 static void
5534 cache_fplookup_parse_advance(struct cache_fpl *fpl)
5535 {
5536 struct nameidata *ndp;
5537 struct componentname *cnp;
5538
5539 ndp = fpl->ndp;
5540 cnp = fpl->cnp;
5541
5542 cnp->cn_nameptr = ndp->ni_next;
5543 KASSERT(*(cnp->cn_nameptr) == '/',
5544 ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5545 cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
5546 cnp->cn_nameptr++;
5547 cache_fpl_pathlen_dec(fpl);
5548 }
5549
5550 /*
5551 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5552 *
5553 * Lockless lookup tries to elide checking for spurious slashes and should they
5554 * be present is guaranteed to fail to find an entry. In this case the caller
5555 * must check if the name starts with a slash and call this routine. It is
5556 * going to fast forward across the spurious slashes and set the state up for
5557 * retry.
5558 */
5559 static int __noinline
5560 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
5561 {
5562 struct nameidata *ndp;
5563 struct componentname *cnp;
5564
5565 ndp = fpl->ndp;
5566 cnp = fpl->cnp;
5567
5568 MPASS(*(cnp->cn_nameptr) == '/');
5569 do {
5570 cnp->cn_nameptr++;
5571 cache_fpl_pathlen_dec(fpl);
5572 } while (*(cnp->cn_nameptr) == '/');
5573
5574 /*
5575 * Go back to one slash so that cache_fplookup_parse_advance has
5576 * something to skip.
5577 */
5578 cnp->cn_nameptr--;
5579 cache_fpl_pathlen_inc(fpl);
5580
5581 /*
5582 * cache_fplookup_parse_advance starts from ndp->ni_next
5583 */
5584 ndp->ni_next = cnp->cn_nameptr;
5585
5586 /*
5587 * See cache_fplookup_dot.
5588 */
5589 fpl->tvp = fpl->dvp;
5590 fpl->tvp_seqc = fpl->dvp_seqc;
5591
5592 return (0);
5593 }
5594
5595 /*
5596 * Handle trailing slashes (e.g., "foo/").
5597 *
5598  * If a trailing slash is found, the terminal vnode must be a directory.
5599  * Regular lookup shortens the path by nullifying the first trailing slash and
5600 * sets the TRAILINGSLASH flag to denote this took place. There are several
5601 * checks on it performed later.
5602 *
5603 * Similarly to spurious slashes, lockless lookup handles this in a speculative
5604 * manner relying on an invariant that a non-directory vnode will get a miss.
5605 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
5606 *
5607 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
5608 * and denotes this is the last path component, which avoids looping back.
5609 *
5610 * Only plain lookups are supported for now to restrict corner cases to handle.
5611 */
5612 static int __noinline
5613 cache_fplookup_trailingslash(struct cache_fpl *fpl)
5614 {
5615 #ifdef INVARIANTS
5616 size_t ni_pathlen;
5617 #endif
5618 struct nameidata *ndp;
5619 struct componentname *cnp;
5620 struct namecache *ncp;
5621 struct vnode *tvp;
5622 char *cn_nameptr_orig, *cn_nameptr_slash;
5623 seqc_t tvp_seqc;
5624 u_char nc_flag;
5625
5626 ndp = fpl->ndp;
5627 cnp = fpl->cnp;
5628 tvp = fpl->tvp;
5629 tvp_seqc = fpl->tvp_seqc;
5630
5631 MPASS(fpl->dvp == fpl->tvp);
5632 KASSERT(cache_fpl_istrailingslash(fpl),
5633 ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
5634 cnp->cn_pnbuf));
5635 KASSERT(cnp->cn_nameptr[0] == '\0',
5636 ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
5637 cnp->cn_pnbuf));
5638 KASSERT(cnp->cn_namelen == 0,
5639 ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen,
5640 cnp->cn_pnbuf));
5641 MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);
5642
5643 if (cnp->cn_nameiop != LOOKUP) {
5644 return (cache_fpl_aborted(fpl));
5645 }
5646
5647 if (__predict_false(tvp->v_type != VDIR)) {
5648 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
5649 return (cache_fpl_aborted(fpl));
5650 }
5651 cache_fpl_smr_exit(fpl);
5652 return (cache_fpl_handled_error(fpl, ENOTDIR));
5653 }
5654
5655 /*
5656 * Denote the last component.
5657 */
5658 ndp->ni_next = &cnp->cn_nameptr[0];
5659 MPASS(cache_fpl_islastcn(ndp));
5660
5661 /*
5662 * Unwind trailing slashes.
5663 */
5664 cn_nameptr_orig = cnp->cn_nameptr;
5665 while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
5666 cnp->cn_nameptr--;
5667 if (cnp->cn_nameptr[0] != '/') {
5668 break;
5669 }
5670 }
5671
5672 /*
5673 * Unwind to the beginning of the path component.
5674 *
5675 * Note the path may or may not have started with a slash.
5676 */
5677 cn_nameptr_slash = cnp->cn_nameptr;
5678 while (cnp->cn_nameptr > cnp->cn_pnbuf) {
5679 cnp->cn_nameptr--;
5680 if (cnp->cn_nameptr[0] == '/') {
5681 break;
5682 }
5683 }
5684 if (cnp->cn_nameptr[0] == '/') {
5685 cnp->cn_nameptr++;
5686 }
5687
5688 cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
5689 cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
5690 cache_fpl_checkpoint(fpl);
5691
5692 #ifdef INVARIANTS
5693 ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
5694 if (ni_pathlen != fpl->debug.ni_pathlen) {
5695 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5696 __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5697 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5698 }
5699 #endif
5700
5701 /*
5702 * If this was a "./" lookup the parent directory is already correct.
5703 */
5704 if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
5705 return (0);
5706 }
5707
5708 /*
5709 * Otherwise we need to look it up.
5710 */
5711 tvp = fpl->tvp;
5712 ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
5713 if (__predict_false(ncp == NULL)) {
5714 return (cache_fpl_aborted(fpl));
5715 }
5716 nc_flag = atomic_load_char(&ncp->nc_flag);
5717 if ((nc_flag & NCF_ISDOTDOT) != 0) {
5718 return (cache_fpl_aborted(fpl));
5719 }
5720 fpl->dvp = ncp->nc_dvp;
5721 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5722 if (seqc_in_modify(fpl->dvp_seqc)) {
5723 return (cache_fpl_aborted(fpl));
5724 }
5725 return (0);
5726 }
5727
5728 /*
5729 * See the API contract for VOP_FPLOOKUP_VEXEC.
5730 */
5731 static int __noinline
5732 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
5733 {
5734 struct componentname *cnp;
5735 struct vnode *dvp;
5736 seqc_t dvp_seqc;
5737
5738 cnp = fpl->cnp;
5739 dvp = fpl->dvp;
5740 dvp_seqc = fpl->dvp_seqc;
5741
5742 /*
5743 * TODO: Due to ignoring trailing slashes lookup will perform a
5744 * permission check on the last dir when it should not be doing it. It
5745 * may fail, but said failure should be ignored. It is possible to fix
5746 * it up fully without resorting to regular lookup, but for now just
5747 * abort.
5748 */
5749 if (cache_fpl_istrailingslash(fpl)) {
5750 return (cache_fpl_aborted(fpl));
5751 }
5752
5753 /*
5754 * Hack: delayed degenerate path checking.
5755 */
5756 if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
5757 return (cache_fplookup_degenerate(fpl));
5758 }
5759
5760 /*
5761 * Hack: delayed name len checking.
5762 */
5763 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
5764 cache_fpl_smr_exit(fpl);
5765 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
5766 }
5767
5768 /*
5769 * Hack: they may be looking up foo/bar, where foo is not a directory.
5770 * In such a case we need to return ENOTDIR, but we may happen to get
5771 * here with a different error.
5772 */
5773 if (dvp->v_type != VDIR) {
5774 error = ENOTDIR;
5775 }
5776
5777 /*
5778 * Hack: handle O_SEARCH.
5779 *
5780 * Open Group Base Specifications Issue 7, 2018 edition states:
5781 * <quote>
5782 * If the access mode of the open file description associated with the
5783 * file descriptor is not O_SEARCH, the function shall check whether
5784 * directory searches are permitted using the current permissions of
5785 * the directory underlying the file descriptor. If the access mode is
5786 * O_SEARCH, the function shall not perform the check.
5787 * </quote>
5788 *
5789 * Regular lookup tests for the NOEXECCHECK flag for every path
5790 * component to decide whether to do the permission check. However,
5791 * since most lookups never have the flag (and when they do it is only
5792 * present for the first path component), lockless lookup only acts on
5793 * it if there is a permission problem. Here the flag is represented
5794 * with a boolean so that we don't have to clear it on the way out.
5795 *
5796 * For simplicity this always aborts.
5797 * TODO: check if this is the first lookup and ignore the permission
5798 * problem. Note the flag has to survive fallback (if it happens to be
5799 * performed).
5800 */
5801 if (fpl->fsearch) {
5802 return (cache_fpl_aborted(fpl));
5803 }
5804
5805 switch (error) {
5806 case EAGAIN:
5807 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5808 error = cache_fpl_aborted(fpl);
5809 } else {
5810 cache_fpl_partial(fpl);
5811 }
5812 break;
5813 default:
5814 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5815 error = cache_fpl_aborted(fpl);
5816 } else {
5817 cache_fpl_smr_exit(fpl);
5818 cache_fpl_handled_error(fpl, error);
5819 }
5820 break;
5821 }
5822 return (error);
5823 }
5824
5825 static int
5826 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
5827 {
5828 struct nameidata *ndp;
5829 struct componentname *cnp;
5830 struct mount *mp;
5831 int error;
5832
5833 ndp = fpl->ndp;
5834 cnp = fpl->cnp;
5835
5836 cache_fpl_checkpoint(fpl);
5837
5838 /*
5839  * The vnode at hand is almost always stable, so skip checking for it.
5840  * In the worst case this postpones the check towards the end of the iteration
5841 * of the main loop.
5842 */
5843 fpl->dvp = dvp;
5844 fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);
5845
5846 mp = atomic_load_ptr(&dvp->v_mount);
5847 if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
5848 return (cache_fpl_aborted(fpl));
5849 }
5850
5851 MPASS(fpl->tvp == NULL);
5852
5853 for (;;) {
5854 cache_fplookup_parse(fpl);
5855
5856 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
5857 if (__predict_false(error != 0)) {
5858 error = cache_fplookup_failed_vexec(fpl, error);
5859 break;
5860 }
5861
5862 error = cache_fplookup_next(fpl);
5863 if (__predict_false(cache_fpl_terminated(fpl))) {
5864 break;
5865 }
5866
5867 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
5868
5869 if (fpl->tvp->v_type == VLNK) {
5870 error = cache_fplookup_symlink(fpl);
5871 if (cache_fpl_terminated(fpl)) {
5872 break;
5873 }
5874 } else {
5875 if (cache_fpl_islastcn(ndp)) {
5876 error = cache_fplookup_final(fpl);
5877 break;
5878 }
5879
5880 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
5881 error = cache_fpl_aborted(fpl);
5882 break;
5883 }
5884
5885 fpl->dvp = fpl->tvp;
5886 fpl->dvp_seqc = fpl->tvp_seqc;
5887 cache_fplookup_parse_advance(fpl);
5888 }
5889
5890 cache_fpl_checkpoint(fpl);
5891 }
5892
5893 return (error);
5894 }
5895
5896 /*
5897 * Fast path lookup protected with SMR and sequence counters.
5898 *
5899 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
5900 *
5901 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
5902 * criteria outlined below.
5903 *
5904 * Traditional vnode lookup conceptually looks like this:
5905 *
5906 * vn_lock(current);
5907 * for (;;) {
5908 * next = find();
5909 * vn_lock(next);
5910 * vn_unlock(current);
5911 * current = next;
5912 * if (last)
5913 * break;
5914 * }
5915 * return (current);
5916 *
5917 * Each jump to the next vnode is safe memory-wise and atomic with respect to
5918 * any modifications, thanks to holding the respective locks.
5919 *
5920 * The same guarantee can be provided with a combination of safe memory
5921 * reclamation and sequence counters instead. If all operations which affect
5922 * the relationship between the current vnode and the one we are looking for
5923 * also modify the counter, we can verify whether all the conditions held as
5924 * we made the jump. This includes things like permissions, mount points, etc.
5925 * Counter modification is provided by enclosing relevant places in
5926 * vn_seqc_write_begin()/end() calls.
5927 *
5928 * Thus this translates to:
5929 *
5930 * vfs_smr_enter();
5931 * dvp_seqc = seqc_read_any(dvp);
5932 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
5933 * abort();
5934 * for (;;) {
5935 * tvp = find();
5936 * tvp_seqc = seqc_read_any(tvp);
5937 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
5938 * abort();
5939 * if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
5940 * abort();
5941 * dvp = tvp; // we know nothing of importance has changed
5942 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
5943 * if (last)
5944 * break;
5945 * }
5946 * vget(); // secure the vnode
5947 * if (!seqc_consistent(tvp, tvp_seqc)) // final check
5948 * abort();
5949 * // at this point we know nothing has changed for any parent<->child pair
5950 * // as they were crossed during the lookup, meaning we matched the guarantee
5951 * // of the locked variant
5952 * return (tvp);
5953 *
5954 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
5955 * - they are called while within vfs_smr protection, which they must never exit
5956 * - EAGAIN can be returned to denote that the check could not be performed; it
5957 * is always valid to return it
5958 * - if the sequence counter has not changed, the result must be valid
5959 * - if the sequence counter has changed, both false positives and false negatives
5960 * are permitted (since the result will be rejected later)
5961 * - for simple cases of unix permission checks, vaccess_vexec_smr can be used
5962 *
5963 * Caveats to watch out for:
5964 * - vnodes are passed unlocked and unreferenced with nothing stopping
5965 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
5966 * to use atomic_load_ptr to fetch it.
5967 * - the aforementioned object can also get freed, meaning that, absent other
5968 * means, it should be protected with vfs_smr
5969 * - it is up to the routine to either safely check permissions as they are
5970 * modified or to guarantee their stability
5971 */
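
/*
 * Illustrative sketch of a VOP_FPLOOKUP_VEXEC implementation honoring the
 * contract above. "examplefs_node" and EXAMPLEFS_VTON() are hypothetical
 * stand-ins for a filesystem's per-vnode data, where EXAMPLEFS_VTON(vp) is
 * assumed to expand to
 * ((struct examplefs_node *)atomic_load_ptr(&(vp)->v_data)).
 *
 *	static int
 *	examplefs_fplookup_vexec(struct vop_fplookup_vexec_args *ap)
 *	{
 *		struct examplefs_node *node;
 *		mode_t all_x, mode;
 *
 *		// ->v_data may be zapped by VOP_RECLAIM at any point; the
 *		// object itself stays valid under vfs_smr. EAGAIN denotes
 *		// that the check could not be performed.
 *		node = EXAMPLEFS_VTON(ap->a_vp);
 *		if (__predict_false(node == NULL))
 *			return (EAGAIN);
 *
 *		// If everyone has execute permission, the answer cannot be
 *		// negative regardless of credentials.
 *		all_x = S_IXUSR | S_IXGRP | S_IXOTH;
 *		mode = node->en_mode;
 *		if (__predict_true((mode & all_x) == all_x))
 *			return (0);
 *
 *		// Full check; a stale result is rejected by the caller once
 *		// the sequence counter is found to have changed.
 *		return (vaccess_vexec_smr(mode, node->en_uid, node->en_gid,
 *		    ap->a_cred));
 *	}
 */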
5972 int
5973 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
5974 struct pwd **pwdp)
5975 {
5976 struct cache_fpl fpl;
5977 struct pwd *pwd;
5978 struct vnode *dvp;
5979 struct componentname *cnp;
5980 int error;
5981
5982 fpl.status = CACHE_FPL_STATUS_UNSET;
5983 fpl.in_smr = false;
5984 fpl.ndp = ndp;
5985 fpl.cnp = cnp = &ndp->ni_cnd;
5986 MPASS(ndp->ni_lcf == 0);
5987 MPASS(curthread == cnp->cn_thread);
5988 KASSERT((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
5989 ("%s: internal flags found in cn_flags %" PRIx64, __func__,
5990 cnp->cn_flags));
5991 if ((cnp->cn_flags & SAVESTART) != 0) {
5992 MPASS(cnp->cn_nameiop != LOOKUP);
5993 }
5994 MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
5995
5996 if (__predict_false(!cache_can_fplookup(&fpl))) {
5997 *status = fpl.status;
5998 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
5999 return (EOPNOTSUPP);
6000 }
6001
6002 cache_fpl_checkpoint_outer(&fpl);
6003
6004 cache_fpl_smr_enter_initial(&fpl);
6005 #ifdef INVARIANTS
6006 fpl.debug.ni_pathlen = ndp->ni_pathlen;
6007 #endif
6008 fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
6009 fpl.fsearch = false;
6010 fpl.savename = (cnp->cn_flags & SAVENAME) != 0;
6011 fpl.tvp = NULL; /* for degenerate path handling */
6012 fpl.pwd = pwdp;
6013 pwd = pwd_get_smr();
6014 *(fpl.pwd) = pwd;
6015 ndp->ni_rootdir = pwd->pwd_rdir;
6016 ndp->ni_topdir = pwd->pwd_jdir;
6017
6018 if (cnp->cn_pnbuf[0] == '/') {
6019 dvp = cache_fpl_handle_root(&fpl);
6020 MPASS(ndp->ni_resflags == 0);
6021 ndp->ni_resflags = NIRES_ABS;
6022 } else {
6023 if (ndp->ni_dirfd == AT_FDCWD) {
6024 dvp = pwd->pwd_cdir;
6025 } else {
6026 error = cache_fplookup_dirfd(&fpl, &dvp);
6027 if (__predict_false(error != 0)) {
6028 goto out;
6029 }
6030 }
6031 }
6032
6033 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
6034 error = cache_fplookup_impl(dvp, &fpl);
6035 out:
6036 cache_fpl_smr_assert_not_entered(&fpl);
6037 cache_fpl_assert_status(&fpl);
6038 *status = fpl.status;
6039 if (SDT_PROBES_ENABLED()) {
6040 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6041 if (fpl.status == CACHE_FPL_STATUS_HANDLED)
6042 SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
6043 ndp);
6044 }
6045
6046 if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
6047 MPASS(error != CACHE_FPL_FAILED);
6048 if (error != 0) {
6049 MPASS(fpl.dvp == NULL);
6050 MPASS(fpl.tvp == NULL);
6051 MPASS(fpl.savename == false);
6052 }
6053 ndp->ni_dvp = fpl.dvp;
6054 ndp->ni_vp = fpl.tvp;
6055 if (fpl.savename) {
6056 cnp->cn_flags |= HASBUF;
6057 } else {
6058 cache_fpl_cleanup_cnp(cnp);
6059 }
6060 }
6061 return (error);
6062 }
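
/*
 * Illustrative sketch of the write side of the sequence counter protocol
 * described in the comment before cache_fplookup(): a filesystem changing
 * anything a lockless lookup may depend on brackets the change with
 * vn_seqc_write_begin()/vn_seqc_write_end(), which is what makes the
 * vn_seqc_consistent() checks above meaningful. "examplefs_setmode" and
 * "examplefs_node" are hypothetical.
 *
 *	static void
 *	examplefs_setmode(struct vnode *vp, struct examplefs_node *node,
 *	    mode_t newmode)
 *	{
 *		ASSERT_VOP_ELOCKED(vp, __func__);
 *
 *		// Lockless lookups which sampled the counter before this
 *		// point will fail the consistency check and fall back.
 *		vn_seqc_write_begin(vp);
 *		node->en_mode = newmode;
 *		vn_seqc_write_end(vp);
 *	}
 */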