FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_cache.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-3-Clause
    3  *
    4  * Copyright (c) 1989, 1993, 1995
    5  *      The Regents of the University of California.  All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * Poul-Henning Kamp of the FreeBSD Project.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
   35  */
   36 
   37 #include <sys/cdefs.h>
   38 __FBSDID("$FreeBSD$");
   39 
   40 #include "opt_ddb.h"
   41 #include "opt_ktrace.h"
   42 
   43 #include <sys/param.h>
   44 #include <sys/systm.h>
   45 #include <sys/counter.h>
   46 #include <sys/filedesc.h>
   47 #include <sys/fnv_hash.h>
   48 #include <sys/kernel.h>
   49 #include <sys/lock.h>
   50 #include <sys/malloc.h>
   51 #include <sys/fcntl.h>
   52 #include <sys/mount.h>
   53 #include <sys/namei.h>
   54 #include <sys/proc.h>
   55 #include <sys/rwlock.h>
   56 #include <sys/sdt.h>
   57 #include <sys/smp.h>
   58 #include <sys/syscallsubr.h>
   59 #include <sys/sysctl.h>
   60 #include <sys/sysproto.h>
   61 #include <sys/vnode.h>
   62 #ifdef KTRACE
   63 #include <sys/ktrace.h>
   64 #endif
   65 
   66 #ifdef DDB
   67 #include <ddb/ddb.h>
   68 #endif
   69 
   70 #include <vm/uma.h>
   71 
   72 SDT_PROVIDER_DECLARE(vfs);
   73 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
   74     "struct vnode *");
   75 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
   76     "char *");
   77 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
   78 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
   79     "char *", "struct vnode *");
   80 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
   81 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
   82     "struct vnode *", "char *");
   83 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
   84     "struct vnode *");
   85 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
   86     "struct vnode *", "char *");
   87 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
   88     "char *");
   89 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
   90 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
   91 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
   92 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
   93     "struct vnode *");
   94 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
   95     "char *");
   96 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
   97     "char *");
   98 
   99 /*
  100  * This structure describes the elements in the cache of recent
  101  * names looked up by namei.
  102  */
  103 
  104 struct  namecache {
  105         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
  106         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
  107         LIST_ENTRY(namecache) nc_hash;  /* hash chain */
  108         struct  vnode *nc_dvp;          /* vnode of parent of name */
  109         union {
  110                 struct  vnode *nu_vp;   /* vnode the name refers to */
  111         } n_un;
  112         u_char  nc_flag;                /* flag bits */
  113         u_char  nc_nlen;                /* length of name */
  114         char    nc_name[0];             /* segment name + nul */
  115 };
  116 
  117 /*
   118  * struct namecache_ts embeds struct namecache as its final member (nc_nc)
   119  * and prepends the timestamp data.
  120  * struct namecache_ts is used in place of struct namecache when time(s) need
  121  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
  122  * both a non-dotdot directory name plus dotdot for the directory's
  123  * parent.
  124  *
  125  * See below for alignment requirement.
  126  */
  127 struct  namecache_ts {
  128         struct  timespec nc_time;       /* timespec provided by fs */
  129         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
  130         int     nc_ticks;               /* ticks value when entry was added */
  131         struct namecache nc_nc;
  132 };
  133 
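/*
 * Editor's illustrative example (not part of vfs_cache.c): cache_out_ts()
 * further down uses __containerof() to recover the enclosing namecache_ts
 * from a pointer to its embedded nc_nc member.  The standalone userspace
 * sketch below shows the same offsetof-based idiom; all names here
 * (struct inner/outer, containerof) are hypothetical.
 */
#include <stddef.h>
#include <stdio.h>

struct inner {
	int len;
};

struct outer {
	long timestamp;
	struct inner in;		/* embedded, like nc_nc above */
};

/* Same idea as the kernel's __containerof(ptr, type, member). */
#define containerof(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int
main(void)
{
	struct outer o = { .timestamp = 42, .in = { .len = 3 } };
	struct inner *ip = &o.in;
	struct outer *op = containerof(ip, struct outer, in);

	/* Prints 42: the enclosing structure was recovered from &o.in. */
	printf("timestamp: %ld\n", op->timestamp);
	return (0);
}
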
  134 /*
  135  * At least mips n32 performs 64-bit accesses to timespec as found
   136  * in namecache_ts and requires them to be aligned. Since other platforms
   137  * may have the same requirement, suffer a little bit and enforce the
   138  * alignment for everyone. Note this is a nop for 64-bit platforms.
  139  */
  140 #define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
  141 
  142 #define nc_vp           n_un.nu_vp
  143 
  144 /*
  145  * Flags in namecache.nc_flag
  146  */
  147 #define NCF_WHITE       0x01
  148 #define NCF_ISDOTDOT    0x02
  149 #define NCF_TS          0x04
  150 #define NCF_DTS         0x08
  151 #define NCF_DVDROP      0x10
  152 #define NCF_NEGATIVE    0x20
  153 #define NCF_HOTNEGATIVE 0x40
  154 
  155 /*
  156  * Name caching works as follows:
  157  *
  158  * Names found by directory scans are retained in a cache
  159  * for future reference.  It is managed LRU, so frequently
  160  * used names will hang around.  Cache is indexed by hash value
  161  * obtained from (dvp, name) where dvp refers to the directory
  162  * containing name.
  163  *
   164  * If it is a "negative" entry (i.e. for a name that is known NOT to
   165  * exist), no vnode is stored and the entry is marked NCF_NEGATIVE.
  166  *
  167  * Upon reaching the last segment of a path, if the reference
  168  * is for DELETE, or NOCACHE is set (rewrite), and the
  169  * name is located in the cache, it will be dropped.
  170  *
  171  * These locks are used (in the order in which they can be taken):
  172  * NAME         TYPE    ROLE
  173  * vnodelock    mtx     vnode lists and v_cache_dd field protection
  174  * bucketlock   rwlock  for access to given set of hash buckets
  175  * neglist      mtx     negative entry LRU management
  176  *
  177  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
  178  * shrinking the LRU list.
  179  *
  180  * It is legal to take multiple vnodelock and bucketlock locks. The locking
  181  * order is lower address first. Both are recursive.
  182  *
  183  * "." lookups are lockless.
  184  *
  185  * ".." and vnode -> name lookups require vnodelock.
  186  *
  187  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
  188  *
  189  * Insertions and removals of entries require involved vnodes and bucketlocks
  190  * to be write-locked to prevent other threads from seeing the entry.
  191  *
  192  * Some lookups result in removal of the found entry (e.g. getting rid of a
  193  * negative entry with the intent to create a positive one), which poses a
   194  * problem when multiple threads reach that state at the same time. Similarly,
  195  * threads can purge two different vnodes and try to remove the same name.
  196  *
  197  * If the already held vnode lock is lower than the second required lock, we
  198  * can just take the other lock. However, in the opposite case, this could
   199  * deadlock. This is resolved by trylocking and, if that fails, unlocking the
   200  * first lock, locking everything in order and revalidating the state.
  201  */
  202 
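/*
 * Editor's illustrative example (not part of vfs_cache.c): a standalone
 * userspace sketch of the "lower address first" lock ordering described
 * above, mirroring what _cache_sort_vnodes() and cache_lock_vnodes() do
 * below.  pthread mutexes stand in for the kernel mtx/rwlock locks.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* Acquire two locks in a deadlock-free order: lowest address first. */
static void
lock_pair(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
	pthread_mutex_t *tmp;

	if (l1 > l2) {			/* sort by address */
		tmp = l2;
		l2 = l1;
		l1 = tmp;
	}
	pthread_mutex_lock(l1);
	pthread_mutex_lock(l2);
}

static void
unlock_pair(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
	pthread_mutex_unlock(l1);
	pthread_mutex_unlock(l2);
}

int
main(void)
{
	/* Either argument order acquires the locks in the same global order. */
	lock_pair(&lock_a, &lock_b);
	unlock_pair(&lock_a, &lock_b);
	lock_pair(&lock_b, &lock_a);
	unlock_pair(&lock_b, &lock_a);
	printf("locks always taken lowest address first\n");
	return (0);
}
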
  203 /*
  204  * Structures associated with name caching.
  205  */
  206 #define NCHHASH(hash) \
  207         (&nchashtbl[(hash) & nchash])
  208 static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
  209 static u_long __read_mostly     nchash;                 /* size of hash table */
  210 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
  211     "Size of namecache hash table");
  212 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
  213 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
  214     "Ratio of negative namecache entries");
  215 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
  216 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
  217 u_int ncsizefactor = 2;
  218 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
  219     "Size factor for namecache");
  220 static u_int __read_mostly      ncpurgeminvnodes;
  221 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
  222     "Number of vnodes below which purgevfs ignores the request");
  223 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
  224 
  225 struct nchstats nchstats;               /* cache effectiveness statistics */
  226 
  227 static struct mtx __exclusive_cache_line        ncneg_shrink_lock;
  228 static int      shrink_list_turn;
  229 
  230 struct neglist {
  231         struct mtx              nl_lock;
  232         TAILQ_HEAD(, namecache) nl_list;
  233 } __aligned(CACHE_LINE_SIZE);
  234 
  235 static struct neglist __read_mostly     *neglists;
  236 static struct neglist ncneg_hot;
  237 static u_long numhotneg;
  238 
  239 #define ncneghash       3
  240 #define numneglists     (ncneghash + 1)
  241 static inline struct neglist *
  242 NCP2NEGLIST(struct namecache *ncp)
  243 {
  244 
  245         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
  246 }
  247 
  248 #define numbucketlocks (ncbuckethash + 1)
  249 static u_int __read_mostly  ncbuckethash;
  250 static struct rwlock_padalign __read_mostly  *bucketlocks;
  251 #define HASH2BUCKETLOCK(hash) \
  252         ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
  253 
  254 #define numvnodelocks (ncvnodehash + 1)
  255 static u_int __read_mostly  ncvnodehash;
  256 static struct mtx __read_mostly *vnodelocks;
  257 static inline struct mtx *
  258 VP2VNODELOCK(struct vnode *vp)
  259 {
  260 
  261         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
  262 }
  263 
  264 /*
  265  * UMA zones for the VFS cache.
  266  *
  267  * The small cache is used for entries with short names, which are the
  268  * most common.  The large cache is used for entries which are too big to
  269  * fit in the small cache.
  270  */
  271 static uma_zone_t __read_mostly cache_zone_small;
  272 static uma_zone_t __read_mostly cache_zone_small_ts;
  273 static uma_zone_t __read_mostly cache_zone_large;
  274 static uma_zone_t __read_mostly cache_zone_large_ts;
  275 
  276 #define CACHE_PATH_CUTOFF       35
  277 
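/*
 * Editor's illustrative example (not part of vfs_cache.c): a standalone
 * userspace sketch of the allocation pattern used by cache_alloc() below.
 * The name is stored inline after the fixed-size header (cf. the trailing
 * nc_name[] member of struct namecache), and a short/long split similar to
 * CACHE_PATH_CUTOFF picks one of two allocation sizes.  malloc() stands in
 * for uma_zalloc() and the exact sizes are assumptions for illustration.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEMO_PATH_CUTOFF	35

struct demo_entry {
	unsigned char nlen;		/* length of name */
	char name[];			/* segment name + nul, stored inline */
};

static struct demo_entry *
demo_alloc(const char *name)
{
	size_t len = strlen(name);
	/* "small" entries share one size; longer names get a bigger one. */
	size_t room = (len <= DEMO_PATH_CUTOFF) ? DEMO_PATH_CUTOFF + 1 : len + 1;
	struct demo_entry *e = malloc(sizeof(*e) + room);

	if (e == NULL)
		return (NULL);
	e->nlen = (unsigned char)len;
	memcpy(e->name, name, len + 1);
	return (e);
}

int
main(void)
{
	struct demo_entry *e = demo_alloc("vfs_cache.c");

	if (e != NULL) {
		printf("stored \"%s\" (%u bytes) inline\n", e->name, e->nlen);
		free(e);
	}
	return (0);
}
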
  278 static struct namecache *
  279 cache_alloc(int len, int ts)
  280 {
  281         struct namecache_ts *ncp_ts;
  282         struct namecache *ncp;
  283 
  284         if (__predict_false(ts)) {
  285                 if (len <= CACHE_PATH_CUTOFF)
  286                         ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK);
  287                 else
  288                         ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK);
  289                 ncp = &ncp_ts->nc_nc;
  290         } else {
  291                 if (len <= CACHE_PATH_CUTOFF)
  292                         ncp = uma_zalloc(cache_zone_small, M_WAITOK);
  293                 else
  294                         ncp = uma_zalloc(cache_zone_large, M_WAITOK);
  295         }
  296         return (ncp);
  297 }
  298 
  299 static void
  300 cache_free(struct namecache *ncp)
  301 {
  302         struct namecache_ts *ncp_ts;
  303 
  304         if (ncp == NULL)
  305                 return;
  306         if ((ncp->nc_flag & NCF_DVDROP) != 0)
  307                 vdrop(ncp->nc_dvp);
  308         if (__predict_false(ncp->nc_flag & NCF_TS)) {
  309                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
  310                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
  311                         uma_zfree(cache_zone_small_ts, ncp_ts);
  312                 else
  313                         uma_zfree(cache_zone_large_ts, ncp_ts);
  314         } else {
  315                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
  316                         uma_zfree(cache_zone_small, ncp);
  317                 else
  318                         uma_zfree(cache_zone_large, ncp);
  319         }
  320 }
  321 
  322 static void
  323 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
  324 {
  325         struct namecache_ts *ncp_ts;
  326 
  327         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
  328             (tsp == NULL && ticksp == NULL),
  329             ("No NCF_TS"));
  330 
  331         if (tsp == NULL && ticksp == NULL)
  332                 return;
  333 
  334         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
  335         if (tsp != NULL)
  336                 *tsp = ncp_ts->nc_time;
  337         if (ticksp != NULL)
  338                 *ticksp = ncp_ts->nc_ticks;
  339 }
  340 
  341 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
  342 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
  343     "VFS namecache enabled");
  344 
  345 /* Export size information to userland */
  346 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
  347     sizeof(struct namecache), "sizeof(struct namecache)");
  348 
  349 /*
  350  * The new name cache statistics
  351  */
  352 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
  353     "Name cache statistics");
  354 #define STATNODE_ULONG(name, descr)     \
  355         SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
  356 #define STATNODE_COUNTER(name, descr)   \
  357         static counter_u64_t __read_mostly name; \
  358         SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr);
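/*
 * Editor's note (not part of vfs_cache.c): each STATNODE_ULONG() use exports
 * an existing u_long read-only under the vfs.cache sysctl tree, while each
 * STATNODE_COUNTER() use additionally declares a per-CPU counter(9) counter;
 * the counters themselves are allocated during initialization (not shown in
 * this excerpt).
 */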
  359 STATNODE_ULONG(numneg, "Number of negative cache entries");
  360 STATNODE_ULONG(numcache, "Number of cache entries");
  361 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
  362 STATNODE_COUNTER(numcalls, "Number of cache lookups");
  363 STATNODE_COUNTER(dothits, "Number of '.' hits");
  364 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
  365 STATNODE_COUNTER(numchecks, "Number of checks in lookup");
  366 STATNODE_COUNTER(nummiss, "Number of cache misses");
  367 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
  368 STATNODE_COUNTER(numposzaps,
  369     "Number of cache hits (positive) we do not want to cache");
  370 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
  371 STATNODE_COUNTER(numnegzaps,
  372     "Number of cache hits (negative) we do not want to cache");
  373 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
  374 /* These count for kern___getcwd(), too. */
  375 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
  376 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
  377 STATNODE_COUNTER(numfullpathfail2,
  378     "Number of fullpath search errors (VOP_VPTOCNP failures)");
  379 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
  380 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
  381 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
  382     "Number of successful removals after relocking");
  383 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
   384     "Number of times zap_and_exit failed to lock (lookup without MAKEENTRY)");
  385 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
   386     "Number of times zap_and_exit failed to lock (regular lookup)");
  387 static long cache_lock_vnodes_cel_3_failures;
  388 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
  389     "Number of times 3-way vnode locking failed");
  390 STATNODE_ULONG(numhotneg, "Number of hot negative entries");
  391 STATNODE_COUNTER(numneg_evicted,
  392     "Number of negative entries evicted when adding a new entry");
  393 STATNODE_COUNTER(shrinking_skipped,
  394     "Number of times shrinking was already in progress");
  395 
  396 static void cache_zap_locked(struct namecache *ncp, bool neg_locked);
  397 static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
  398     char *buf, char **retbuf, u_int buflen);
  399 
  400 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
  401 
  402 static int cache_yield;
  403 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
  404     "Number of times cache called yield");
  405 
  406 static void __noinline
  407 cache_maybe_yield(void)
  408 {
  409 
  410         if (should_yield()) {
  411                 cache_yield++;
  412                 kern_yield(PRI_USER);
  413         }
  414 }
  415 
  416 static inline void
  417 cache_assert_vlp_locked(struct mtx *vlp)
  418 {
  419 
  420         if (vlp != NULL)
  421                 mtx_assert(vlp, MA_OWNED);
  422 }
  423 
  424 static inline void
  425 cache_assert_vnode_locked(struct vnode *vp)
  426 {
  427         struct mtx *vlp;
  428 
  429         vlp = VP2VNODELOCK(vp);
  430         cache_assert_vlp_locked(vlp);
  431 }
  432 
  433 static uint32_t
  434 cache_get_hash(char *name, u_char len, struct vnode *dvp)
  435 {
  436         uint32_t hash;
  437 
  438         hash = fnv_32_buf(name, len, FNV1_32_INIT);
  439         hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
  440         return (hash);
  441 }
  442 
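/*
 * Editor's illustrative example (not part of vfs_cache.c): a standalone
 * userspace sketch of how cache_get_hash() above and the NCHHASH()/
 * HASH2BUCKETLOCK() macros cooperate.  The hash below is a plain
 * reimplementation of the 32-bit FNV-1 algorithm provided by
 * <sys/fnv_hash.h>; the table masks are made-up values (the real ones are
 * computed at boot).
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_NCHASH		255	/* hash chain mask (chain count - 1) */
#define DEMO_NCBUCKETHASH	31	/* bucket lock mask (lock count - 1) */

static uint32_t
demo_fnv_32_buf(const void *buf, size_t len, uint32_t hval)
{
	const uint8_t *s = buf;

	while (len-- != 0) {
		hval *= 16777619U;	/* FNV 32-bit prime */
		hval ^= *s++;
	}
	return (hval);
}

int
main(void)
{
	const char *name = "vfs_cache.c";
	void *dvp = (void *)(uintptr_t)0xdeadbeef;	/* stand-in for the parent vnode */
	uint32_t hash;

	/* Hash the name, then fold in the parent directory vnode pointer. */
	hash = demo_fnv_32_buf(name, strlen(name), 2166136261U /* FNV1_32_INIT */);
	hash = demo_fnv_32_buf(&dvp, sizeof(dvp), hash);

	printf("hash %#x -> chain %u, bucket lock %u\n", (unsigned)hash,
	    (unsigned)(hash & DEMO_NCHASH), (unsigned)(hash & DEMO_NCBUCKETHASH));
	return (0);
}
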
  443 static inline struct rwlock *
  444 NCP2BUCKETLOCK(struct namecache *ncp)
  445 {
  446         uint32_t hash;
  447 
  448         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
  449         return (HASH2BUCKETLOCK(hash));
  450 }
  451 
  452 #ifdef INVARIANTS
  453 static void
  454 cache_assert_bucket_locked(struct namecache *ncp, int mode)
  455 {
  456         struct rwlock *blp;
  457 
  458         blp = NCP2BUCKETLOCK(ncp);
  459         rw_assert(blp, mode);
  460 }
  461 #else
  462 #define cache_assert_bucket_locked(x, y) do { } while (0)
  463 #endif
  464 
  465 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
  466 static void
  467 _cache_sort_vnodes(void **p1, void **p2)
  468 {
  469         void *tmp;
  470 
  471         MPASS(*p1 != NULL || *p2 != NULL);
  472 
  473         if (*p1 > *p2) {
  474                 tmp = *p2;
  475                 *p2 = *p1;
  476                 *p1 = tmp;
  477         }
  478 }
  479 
  480 static void
  481 cache_lock_all_buckets(void)
  482 {
  483         u_int i;
  484 
  485         for (i = 0; i < numbucketlocks; i++)
  486                 rw_wlock(&bucketlocks[i]);
  487 }
  488 
  489 static void
  490 cache_unlock_all_buckets(void)
  491 {
  492         u_int i;
  493 
  494         for (i = 0; i < numbucketlocks; i++)
  495                 rw_wunlock(&bucketlocks[i]);
  496 }
  497 
  498 static void
  499 cache_lock_all_vnodes(void)
  500 {
  501         u_int i;
  502 
  503         for (i = 0; i < numvnodelocks; i++)
  504                 mtx_lock(&vnodelocks[i]);
  505 }
  506 
  507 static void
  508 cache_unlock_all_vnodes(void)
  509 {
  510         u_int i;
  511 
  512         for (i = 0; i < numvnodelocks; i++)
  513                 mtx_unlock(&vnodelocks[i]);
  514 }
  515 
  516 static int
  517 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
  518 {
  519 
  520         cache_sort_vnodes(&vlp1, &vlp2);
  521 
  522         if (vlp1 != NULL) {
  523                 if (!mtx_trylock(vlp1))
  524                         return (EAGAIN);
  525         }
  526         if (!mtx_trylock(vlp2)) {
  527                 if (vlp1 != NULL)
  528                         mtx_unlock(vlp1);
  529                 return (EAGAIN);
  530         }
  531 
  532         return (0);
  533 }
  534 
  535 static void
  536 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
  537 {
  538 
  539         MPASS(vlp1 != NULL || vlp2 != NULL);
  540         MPASS(vlp1 <= vlp2);
  541 
  542         if (vlp1 != NULL)
  543                 mtx_lock(vlp1);
  544         if (vlp2 != NULL)
  545                 mtx_lock(vlp2);
  546 }
  547 
  548 static void
  549 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
  550 {
  551 
  552         MPASS(vlp1 != NULL || vlp2 != NULL);
  553 
  554         if (vlp1 != NULL)
  555                 mtx_unlock(vlp1);
  556         if (vlp2 != NULL)
  557                 mtx_unlock(vlp2);
  558 }
  559 
  560 static int
  561 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
  562 {
  563         struct nchstats snap;
  564 
  565         if (req->oldptr == NULL)
  566                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
  567 
  568         snap = nchstats;
  569         snap.ncs_goodhits = counter_u64_fetch(numposhits);
  570         snap.ncs_neghits = counter_u64_fetch(numneghits);
  571         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
  572             counter_u64_fetch(numnegzaps);
  573         snap.ncs_miss = counter_u64_fetch(nummisszap) +
  574             counter_u64_fetch(nummiss);
  575 
  576         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
  577 }
  578 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
  579     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
  580     "VFS cache effectiveness statistics");
  581 
  582 #ifdef DIAGNOSTIC
  583 /*
  584  * Grab an atomic snapshot of the name cache hash chain lengths
  585  */
  586 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
  587     "hash table stats");
  588 
  589 static int
  590 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
  591 {
  592         struct nchashhead *ncpp;
  593         struct namecache *ncp;
  594         int i, error, n_nchash, *cntbuf;
  595 
  596 retry:
  597         n_nchash = nchash + 1;  /* nchash is max index, not count */
  598         if (req->oldptr == NULL)
  599                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
  600         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
  601         cache_lock_all_buckets();
  602         if (n_nchash != nchash + 1) {
  603                 cache_unlock_all_buckets();
  604                 free(cntbuf, M_TEMP);
  605                 goto retry;
  606         }
  607         /* Scan hash tables counting entries */
  608         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
  609                 LIST_FOREACH(ncp, ncpp, nc_hash)
  610                         cntbuf[i]++;
  611         cache_unlock_all_buckets();
  612         for (error = 0, i = 0; i < n_nchash; i++)
  613                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
  614                         break;
  615         free(cntbuf, M_TEMP);
  616         return (error);
  617 }
  618 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
  619     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
  620     "nchash chain lengths");
  621 
  622 static int
  623 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
  624 {
  625         int error;
  626         struct nchashhead *ncpp;
  627         struct namecache *ncp;
  628         int n_nchash;
  629         int count, maxlength, used, pct;
  630 
  631         if (!req->oldptr)
  632                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
  633 
  634         cache_lock_all_buckets();
  635         n_nchash = nchash + 1;  /* nchash is max index, not count */
  636         used = 0;
  637         maxlength = 0;
  638 
  639         /* Scan hash tables for applicable entries */
  640         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
  641                 count = 0;
  642                 LIST_FOREACH(ncp, ncpp, nc_hash) {
  643                         count++;
  644                 }
  645                 if (count)
  646                         used++;
  647                 if (maxlength < count)
  648                         maxlength = count;
  649         }
  650         n_nchash = nchash + 1;
  651         cache_unlock_all_buckets();
  652         pct = (used * 100) / (n_nchash / 100);
  653         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
  654         if (error)
  655                 return (error);
  656         error = SYSCTL_OUT(req, &used, sizeof(used));
  657         if (error)
  658                 return (error);
  659         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
  660         if (error)
  661                 return (error);
  662         error = SYSCTL_OUT(req, &pct, sizeof(pct));
  663         if (error)
  664                 return (error);
  665         return (0);
  666 }
  667 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
  668     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
  669     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
  670 #endif
  671 
  672 /*
  673  * Negative entries management
  674  *
  675  * A variation of LRU scheme is used. New entries are hashed into one of
  676  * numneglists cold lists. Entries get promoted to the hot list on first hit.
  677  *
  678  * The shrinker will demote hot list head and evict from the cold list in a
  679  * round-robin manner.
  680  */
  681 static void
  682 cache_negative_hit(struct namecache *ncp)
  683 {
  684         struct neglist *neglist;
  685 
  686         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  687         if (ncp->nc_flag & NCF_HOTNEGATIVE)
  688                 return;
  689         neglist = NCP2NEGLIST(ncp);
  690         mtx_lock(&ncneg_hot.nl_lock);
  691         mtx_lock(&neglist->nl_lock);
  692         if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
  693                 numhotneg++;
  694                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
  695                 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
  696                 ncp->nc_flag |= NCF_HOTNEGATIVE;
  697         }
  698         mtx_unlock(&neglist->nl_lock);
  699         mtx_unlock(&ncneg_hot.nl_lock);
  700 }
  701 
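/*
 * Editor's illustrative example (not part of vfs_cache.c): a standalone
 * userspace sketch of the hot/cold negative-entry LRU described above,
 * using the same <sys/queue.h> TAILQ macros.  Locking, the multiple cold
 * lists and the round-robin selection are omitted; all names here are
 * hypothetical.
 */
#include <sys/queue.h>
#include <stdio.h>

struct demo_neg {
	const char *name;
	int hot;
	TAILQ_ENTRY(demo_neg) link;
};

static TAILQ_HEAD(, demo_neg) coldlist = TAILQ_HEAD_INITIALIZER(coldlist);
static TAILQ_HEAD(, demo_neg) hotlist = TAILQ_HEAD_INITIALIZER(hotlist);

/* New negative entries land at the tail of a cold list. */
static void
demo_insert(struct demo_neg *np)
{
	TAILQ_INSERT_TAIL(&coldlist, np, link);
}

/* A hit promotes the entry from its cold list to the hot list. */
static void
demo_hit(struct demo_neg *np)
{
	if (np->hot)
		return;
	TAILQ_REMOVE(&coldlist, np, link);
	TAILQ_INSERT_TAIL(&hotlist, np, link);
	np->hot = 1;
}

/* The shrinker demotes the hot list head, then evicts a cold list head. */
static struct demo_neg *
demo_shrink(void)
{
	struct demo_neg *np;

	if ((np = TAILQ_FIRST(&hotlist)) != NULL) {
		TAILQ_REMOVE(&hotlist, np, link);
		TAILQ_INSERT_TAIL(&coldlist, np, link);
		np->hot = 0;
	}
	if ((np = TAILQ_FIRST(&coldlist)) != NULL)
		TAILQ_REMOVE(&coldlist, np, link);
	return (np);
}

int
main(void)
{
	struct demo_neg a = { .name = "a" }, b = { .name = "b" };

	demo_insert(&a);
	demo_insert(&b);
	demo_hit(&b);	/* "b" becomes hot, "a" stays cold */
	/* Prints "a": the entry that was never hit is evicted first. */
	printf("evicted: %s\n", demo_shrink()->name);
	return (0);
}
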
  702 static void
  703 cache_negative_insert(struct namecache *ncp, bool neg_locked)
  704 {
  705         struct neglist *neglist;
  706 
  707         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  708         cache_assert_bucket_locked(ncp, RA_WLOCKED);
  709         neglist = NCP2NEGLIST(ncp);
  710         if (!neg_locked) {
  711                 mtx_lock(&neglist->nl_lock);
  712         } else {
  713                 mtx_assert(&neglist->nl_lock, MA_OWNED);
  714         }
  715         TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
  716         if (!neg_locked)
  717                 mtx_unlock(&neglist->nl_lock);
  718         atomic_add_rel_long(&numneg, 1);
  719 }
  720 
  721 static void
  722 cache_negative_remove(struct namecache *ncp, bool neg_locked)
  723 {
  724         struct neglist *neglist;
  725         bool hot_locked = false;
  726         bool list_locked = false;
  727 
  728         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  729         cache_assert_bucket_locked(ncp, RA_WLOCKED);
  730         neglist = NCP2NEGLIST(ncp);
  731         if (!neg_locked) {
  732                 if (ncp->nc_flag & NCF_HOTNEGATIVE) {
  733                         hot_locked = true;
  734                         mtx_lock(&ncneg_hot.nl_lock);
  735                         if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
  736                                 list_locked = true;
  737                                 mtx_lock(&neglist->nl_lock);
  738                         }
  739                 } else {
  740                         list_locked = true;
  741                         mtx_lock(&neglist->nl_lock);
  742                 }
  743         }
  744         if (ncp->nc_flag & NCF_HOTNEGATIVE) {
  745                 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
  746                 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
  747                 numhotneg--;
  748         } else {
  749                 mtx_assert(&neglist->nl_lock, MA_OWNED);
  750                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
  751         }
  752         if (list_locked)
  753                 mtx_unlock(&neglist->nl_lock);
  754         if (hot_locked)
  755                 mtx_unlock(&ncneg_hot.nl_lock);
  756         atomic_subtract_rel_long(&numneg, 1);
  757 }
  758 
  759 static void
  760 cache_negative_shrink_select(int start, struct namecache **ncpp,
  761     struct neglist **neglistpp)
  762 {
  763         struct neglist *neglist;
  764         struct namecache *ncp;
  765         int i;
  766 
  767         *ncpp = ncp = NULL;
  768         neglist = NULL;
  769 
  770         for (i = start; i < numneglists; i++) {
  771                 neglist = &neglists[i];
  772                 if (TAILQ_FIRST(&neglist->nl_list) == NULL)
  773                         continue;
  774                 mtx_lock(&neglist->nl_lock);
  775                 ncp = TAILQ_FIRST(&neglist->nl_list);
  776                 if (ncp != NULL)
  777                         break;
  778                 mtx_unlock(&neglist->nl_lock);
  779         }
  780 
  781         *neglistpp = neglist;
  782         *ncpp = ncp;
  783 }
  784 
  785 static void
  786 cache_negative_zap_one(void)
  787 {
  788         struct namecache *ncp, *ncp2;
  789         struct neglist *neglist;
  790         struct mtx *dvlp;
  791         struct rwlock *blp;
  792 
  793         if (mtx_owner(&ncneg_shrink_lock) != NULL ||
  794             !mtx_trylock(&ncneg_shrink_lock)) {
  795                 counter_u64_add(shrinking_skipped, 1);
  796                 return;
  797         }
  798 
  799         mtx_lock(&ncneg_hot.nl_lock);
  800         ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
  801         if (ncp != NULL) {
  802                 neglist = NCP2NEGLIST(ncp);
  803                 mtx_lock(&neglist->nl_lock);
  804                 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
  805                 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
  806                 ncp->nc_flag &= ~NCF_HOTNEGATIVE;
  807                 numhotneg--;
  808                 mtx_unlock(&neglist->nl_lock);
  809         }
  810         mtx_unlock(&ncneg_hot.nl_lock);
  811 
  812         cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
  813         shrink_list_turn++;
  814         if (shrink_list_turn == numneglists)
  815                 shrink_list_turn = 0;
  816         if (ncp == NULL && shrink_list_turn == 0)
  817                 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
  818         mtx_unlock(&ncneg_shrink_lock);
  819         if (ncp == NULL)
  820                 return;
  821 
  822         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  823         dvlp = VP2VNODELOCK(ncp->nc_dvp);
  824         blp = NCP2BUCKETLOCK(ncp);
  825         mtx_unlock(&neglist->nl_lock);
  826         mtx_lock(dvlp);
  827         rw_wlock(blp);
  828         mtx_lock(&neglist->nl_lock);
  829         ncp2 = TAILQ_FIRST(&neglist->nl_list);
  830         if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
  831             blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) {
  832                 ncp = NULL;
  833         } else {
  834                 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
  835                     ncp->nc_name);
  836 
  837                 cache_zap_locked(ncp, true);
  838                 counter_u64_add(numneg_evicted, 1);
  839         }
  840         mtx_unlock(&neglist->nl_lock);
  841         rw_wunlock(blp);
  842         mtx_unlock(dvlp);
  843         cache_free(ncp);
  844 }
  845 
  846 /*
  847  * cache_zap_locked():
  848  *
  849  *   Removes a namecache entry from cache, whether it contains an actual
  850  *   pointer to a vnode or if it is just a negative cache entry.
  851  */
  852 static void
  853 cache_zap_locked(struct namecache *ncp, bool neg_locked)
  854 {
  855 
  856         if (!(ncp->nc_flag & NCF_NEGATIVE))
  857                 cache_assert_vnode_locked(ncp->nc_vp);
  858         cache_assert_vnode_locked(ncp->nc_dvp);
  859         cache_assert_bucket_locked(ncp, RA_WLOCKED);
  860 
  861         CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
  862             (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
  863         LIST_REMOVE(ncp, nc_hash);
  864         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
  865                 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
  866                     ncp->nc_name, ncp->nc_vp);
  867                 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
  868                 if (ncp == ncp->nc_vp->v_cache_dd)
  869                         ncp->nc_vp->v_cache_dd = NULL;
  870         } else {
  871                 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
  872                     ncp->nc_name);
  873                 cache_negative_remove(ncp, neg_locked);
  874         }
  875         if (ncp->nc_flag & NCF_ISDOTDOT) {
  876                 if (ncp == ncp->nc_dvp->v_cache_dd)
  877                         ncp->nc_dvp->v_cache_dd = NULL;
  878         } else {
  879                 LIST_REMOVE(ncp, nc_src);
  880                 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
  881                         ncp->nc_flag |= NCF_DVDROP;
  882                         counter_u64_add(numcachehv, -1);
  883                 }
  884         }
  885         atomic_subtract_rel_long(&numcache, 1);
  886 }
  887 
  888 static void
  889 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
  890 {
  891         struct rwlock *blp;
  892 
  893         MPASS(ncp->nc_dvp == vp);
  894         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  895         cache_assert_vnode_locked(vp);
  896 
  897         blp = NCP2BUCKETLOCK(ncp);
  898         rw_wlock(blp);
  899         cache_zap_locked(ncp, false);
  900         rw_wunlock(blp);
  901 }
  902 
  903 static bool
  904 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
  905     struct mtx **vlpp)
  906 {
  907         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
  908         struct rwlock *blp;
  909 
  910         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
  911         cache_assert_vnode_locked(vp);
  912 
  913         if (ncp->nc_flag & NCF_NEGATIVE) {
  914                 if (*vlpp != NULL) {
  915                         mtx_unlock(*vlpp);
  916                         *vlpp = NULL;
  917                 }
  918                 cache_zap_negative_locked_vnode_kl(ncp, vp);
  919                 return (true);
  920         }
  921 
  922         pvlp = VP2VNODELOCK(vp);
  923         blp = NCP2BUCKETLOCK(ncp);
  924         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
  925         vlp2 = VP2VNODELOCK(ncp->nc_vp);
  926 
  927         if (*vlpp == vlp1 || *vlpp == vlp2) {
  928                 to_unlock = *vlpp;
  929                 *vlpp = NULL;
  930         } else {
  931                 if (*vlpp != NULL) {
  932                         mtx_unlock(*vlpp);
  933                         *vlpp = NULL;
  934                 }
  935                 cache_sort_vnodes(&vlp1, &vlp2);
  936                 if (vlp1 == pvlp) {
  937                         mtx_lock(vlp2);
  938                         to_unlock = vlp2;
  939                 } else {
  940                         if (!mtx_trylock(vlp1))
  941                                 goto out_relock;
  942                         to_unlock = vlp1;
  943                 }
  944         }
  945         rw_wlock(blp);
  946         cache_zap_locked(ncp, false);
  947         rw_wunlock(blp);
  948         if (to_unlock != NULL)
  949                 mtx_unlock(to_unlock);
  950         return (true);
  951 
  952 out_relock:
  953         mtx_unlock(vlp2);
  954         mtx_lock(vlp1);
  955         mtx_lock(vlp2);
  956         MPASS(*vlpp == NULL);
  957         *vlpp = vlp1;
  958         return (false);
  959 }
  960 
  961 static int __noinline
  962 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
  963 {
  964         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
  965         struct rwlock *blp;
  966         int error = 0;
  967 
  968         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
  969         cache_assert_vnode_locked(vp);
  970 
  971         pvlp = VP2VNODELOCK(vp);
  972         if (ncp->nc_flag & NCF_NEGATIVE) {
  973                 cache_zap_negative_locked_vnode_kl(ncp, vp);
  974                 goto out;
  975         }
  976 
  977         blp = NCP2BUCKETLOCK(ncp);
  978         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
  979         vlp2 = VP2VNODELOCK(ncp->nc_vp);
  980         cache_sort_vnodes(&vlp1, &vlp2);
  981         if (vlp1 == pvlp) {
  982                 mtx_lock(vlp2);
  983                 to_unlock = vlp2;
  984         } else {
  985                 if (!mtx_trylock(vlp1)) {
  986                         error = EAGAIN;
  987                         goto out;
  988                 }
  989                 to_unlock = vlp1;
  990         }
  991         rw_wlock(blp);
  992         cache_zap_locked(ncp, false);
  993         rw_wunlock(blp);
  994         mtx_unlock(to_unlock);
  995 out:
  996         mtx_unlock(pvlp);
  997         return (error);
  998 }
  999 
 1000 /*
 1001  * If trylocking failed we can get here. We know enough to take all needed locks
 1002  * in the right order and re-lookup the entry.
 1003  */
 1004 static int
 1005 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
 1006     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
 1007     struct rwlock *blp)
 1008 {
 1009         struct namecache *rncp;
 1010 
 1011         cache_assert_bucket_locked(ncp, RA_UNLOCKED);
 1012 
 1013         cache_sort_vnodes(&dvlp, &vlp);
 1014         cache_lock_vnodes(dvlp, vlp);
 1015         rw_wlock(blp);
 1016         LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
 1017                 if (rncp == ncp && rncp->nc_dvp == dvp &&
 1018                     rncp->nc_nlen == cnp->cn_namelen &&
 1019                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
 1020                         break;
 1021         }
 1022         if (rncp != NULL) {
 1023                 cache_zap_locked(rncp, false);
 1024                 rw_wunlock(blp);
 1025                 cache_unlock_vnodes(dvlp, vlp);
 1026                 counter_u64_add(zap_and_exit_bucket_relock_success, 1);
 1027                 return (0);
 1028         }
 1029 
 1030         rw_wunlock(blp);
 1031         cache_unlock_vnodes(dvlp, vlp);
 1032         return (EAGAIN);
 1033 }
 1034 
 1035 static int __noinline
 1036 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
 1037     uint32_t hash, struct rwlock *blp)
 1038 {
 1039         struct mtx *dvlp, *vlp;
 1040         struct vnode *dvp;
 1041 
 1042         cache_assert_bucket_locked(ncp, RA_WLOCKED);
 1043 
 1044         dvlp = VP2VNODELOCK(ncp->nc_dvp);
 1045         vlp = NULL;
 1046         if (!(ncp->nc_flag & NCF_NEGATIVE))
 1047                 vlp = VP2VNODELOCK(ncp->nc_vp);
 1048         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
 1049                 cache_zap_locked(ncp, false);
 1050                 rw_wunlock(blp);
 1051                 cache_unlock_vnodes(dvlp, vlp);
 1052                 return (0);
 1053         }
 1054 
 1055         dvp = ncp->nc_dvp;
 1056         rw_wunlock(blp);
 1057         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
 1058 }
 1059 
 1060 static int __noinline
 1061 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
 1062     uint32_t hash, struct rwlock *blp)
 1063 {
 1064         struct mtx *dvlp, *vlp;
 1065         struct vnode *dvp;
 1066 
 1067         cache_assert_bucket_locked(ncp, RA_RLOCKED);
 1068 
 1069         dvlp = VP2VNODELOCK(ncp->nc_dvp);
 1070         vlp = NULL;
 1071         if (!(ncp->nc_flag & NCF_NEGATIVE))
 1072                 vlp = VP2VNODELOCK(ncp->nc_vp);
 1073         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
 1074                 rw_runlock(blp);
 1075                 rw_wlock(blp);
 1076                 cache_zap_locked(ncp, false);
 1077                 rw_wunlock(blp);
 1078                 cache_unlock_vnodes(dvlp, vlp);
 1079                 return (0);
 1080         }
 1081 
 1082         dvp = ncp->nc_dvp;
 1083         rw_runlock(blp);
 1084         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
 1085 }
 1086 
 1087 static int
 1088 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
 1089     struct mtx **vlpp1, struct mtx **vlpp2)
 1090 {
 1091         struct mtx *dvlp, *vlp;
 1092 
 1093         cache_assert_bucket_locked(ncp, RA_WLOCKED);
 1094 
 1095         dvlp = VP2VNODELOCK(ncp->nc_dvp);
 1096         vlp = NULL;
 1097         if (!(ncp->nc_flag & NCF_NEGATIVE))
 1098                 vlp = VP2VNODELOCK(ncp->nc_vp);
 1099         cache_sort_vnodes(&dvlp, &vlp);
 1100 
 1101         if (*vlpp1 == dvlp && *vlpp2 == vlp) {
 1102                 cache_zap_locked(ncp, false);
 1103                 cache_unlock_vnodes(dvlp, vlp);
 1104                 *vlpp1 = NULL;
 1105                 *vlpp2 = NULL;
 1106                 return (0);
 1107         }
 1108 
 1109         if (*vlpp1 != NULL)
 1110                 mtx_unlock(*vlpp1);
 1111         if (*vlpp2 != NULL)
 1112                 mtx_unlock(*vlpp2);
 1113         *vlpp1 = NULL;
 1114         *vlpp2 = NULL;
 1115 
 1116         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
 1117                 cache_zap_locked(ncp, false);
 1118                 cache_unlock_vnodes(dvlp, vlp);
 1119                 return (0);
 1120         }
 1121 
 1122         rw_wunlock(blp);
 1123         *vlpp1 = dvlp;
 1124         *vlpp2 = vlp;
 1125         if (*vlpp1 != NULL)
 1126                 mtx_lock(*vlpp1);
 1127         mtx_lock(*vlpp2);
 1128         rw_wlock(blp);
 1129         return (EAGAIN);
 1130 }
 1131 
 1132 static void
 1133 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
 1134 {
 1135 
 1136         if (blp != NULL) {
 1137                 rw_runlock(blp);
 1138         } else {
 1139                 mtx_unlock(vlp);
 1140         }
 1141 }
 1142 
 1143 static int __noinline
 1144 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
 1145     struct timespec *tsp, int *ticksp)
 1146 {
 1147         int ltype;
 1148 
 1149         *vpp = dvp;
 1150         CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
 1151                         dvp, cnp->cn_nameptr);
 1152         counter_u64_add(dothits, 1);
 1153         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
 1154         if (tsp != NULL)
 1155                 timespecclear(tsp);
 1156         if (ticksp != NULL)
 1157                 *ticksp = ticks;
 1158         vrefact(*vpp);
 1159         /*
 1160          * When we lookup "." we still can be asked to lock it
 1161          * differently...
 1162          */
 1163         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
 1164         if (ltype != VOP_ISLOCKED(*vpp)) {
 1165                 if (ltype == LK_EXCLUSIVE) {
 1166                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
 1167                         if ((*vpp)->v_iflag & VI_DOOMED) {
 1168                                 /* forced unmount */
 1169                                 vrele(*vpp);
 1170                                 *vpp = NULL;
 1171                                 return (ENOENT);
 1172                         }
 1173                 } else
 1174                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
 1175         }
 1176         return (-1);
 1177 }
 1178 
 1179 static __noinline int
 1180 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
 1181     struct componentname *cnp, struct timespec *tsp, int *ticksp)
 1182 {
 1183         struct namecache *ncp;
 1184         struct rwlock *blp;
 1185         struct mtx *dvlp, *dvlp2;
 1186         uint32_t hash;
 1187         int error;
 1188 
 1189         if (cnp->cn_namelen == 2 &&
 1190             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
 1191                 counter_u64_add(dotdothits, 1);
 1192                 dvlp = VP2VNODELOCK(dvp);
 1193                 dvlp2 = NULL;
 1194                 mtx_lock(dvlp);
 1195 retry_dotdot:
 1196                 ncp = dvp->v_cache_dd;
 1197                 if (ncp == NULL) {
 1198                         SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
 1199                             "..", NULL);
 1200                         mtx_unlock(dvlp);
 1201                         if (dvlp2 != NULL)
 1202                                 mtx_unlock(dvlp2);
 1203                         return (0);
 1204                 }
 1205                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
 1206                         if (ncp->nc_dvp != dvp)
 1207                                 panic("dvp %p v_cache_dd %p\n", dvp, ncp);
 1208                         if (!cache_zap_locked_vnode_kl2(ncp,
 1209                             dvp, &dvlp2))
 1210                                 goto retry_dotdot;
 1211                         MPASS(dvp->v_cache_dd == NULL);
 1212                         mtx_unlock(dvlp);
 1213                         if (dvlp2 != NULL)
 1214                                 mtx_unlock(dvlp2);
 1215                         cache_free(ncp);
 1216                 } else {
 1217                         dvp->v_cache_dd = NULL;
 1218                         mtx_unlock(dvlp);
 1219                         if (dvlp2 != NULL)
 1220                                 mtx_unlock(dvlp2);
 1221                 }
 1222                 return (0);
 1223         }
 1224 
 1225         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 1226         blp = HASH2BUCKETLOCK(hash);
 1227 retry:
 1228         if (LIST_EMPTY(NCHHASH(hash)))
 1229                 goto out_no_entry;
 1230 
 1231         rw_wlock(blp);
 1232 
 1233         LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 1234                 counter_u64_add(numchecks, 1);
 1235                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 1236                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 1237                         break;
 1238         }
 1239 
 1240         /* We failed to find an entry */
 1241         if (ncp == NULL) {
 1242                 rw_wunlock(blp);
 1243                 goto out_no_entry;
 1244         }
 1245 
 1246         error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
 1247         if (__predict_false(error != 0)) {
 1248                 zap_and_exit_bucket_fail++;
 1249                 cache_maybe_yield();
 1250                 goto retry;
 1251         }
 1252         counter_u64_add(numposzaps, 1);
 1253         cache_free(ncp);
 1254         return (0);
 1255 out_no_entry:
 1256         SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
 1257         counter_u64_add(nummisszap, 1);
 1258         return (0);
 1259 }
 1260 
 1261 /**
 1262  * Lookup a name in the name cache
 1263  *
 1264  * # Arguments
 1265  *
 1266  * - dvp:       Parent directory in which to search.
 1267  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
 1268  * - cnp:       Parameters of the name search.  The most interesting bits of
 1269  *              the cn_flags field have the following meanings:
 1270  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
 1271  *                      it up.
 1272  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
 1273  * - tsp:       Return storage for cache timestamp.  On a successful (positive
 1274  *              or negative) lookup, tsp will be filled with any timespec that
 1275  *              was stored when this cache entry was created.  However, it will
 1276  *              be clear for "." entries.
  1277  * - ticksp:    Return storage for alternate cache timestamp.  On a successful
 1278  *              (positive or negative) lookup, it will contain the ticks value
 1279  *              that was current when the cache entry was created, unless cnp
 1280  *              was ".".
 1281  *
 1282  * # Returns
 1283  *
 1284  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
 1285  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
 1286  *              to a forced unmount.  vpp will not be modified.  If the entry
 1287  *              is a whiteout, then the ISWHITEOUT flag will be set in
 1288  *              cnp->cn_flags.
 1289  * - 0:         A cache miss.  vpp will not be modified.
 1290  *
 1291  * # Locking
 1292  *
 1293  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 1294  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 1295  * lock is not recursively acquired.
 1296  */
 1297 int
 1298 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
 1299     struct timespec *tsp, int *ticksp)
 1300 {
 1301         struct namecache_ts *ncp_ts;
 1302         struct namecache *ncp;
 1303         struct rwlock *blp;
 1304         struct mtx *dvlp;
 1305         uint32_t hash;
 1306         int error, ltype;
 1307 
 1308         if (__predict_false(!doingcache)) {
 1309                 cnp->cn_flags &= ~MAKEENTRY;
 1310                 return (0);
 1311         }
 1312 
 1313         counter_u64_add(numcalls, 1);
 1314 
 1315         if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
 1316                 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
 1317 
 1318         if ((cnp->cn_flags & MAKEENTRY) == 0)
 1319                 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));
 1320 
 1321 retry:
 1322         blp = NULL;
 1323         dvlp = NULL;
 1324         error = 0;
 1325         if (cnp->cn_namelen == 2 &&
 1326             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
 1327                 counter_u64_add(dotdothits, 1);
 1328                 dvlp = VP2VNODELOCK(dvp);
 1329                 mtx_lock(dvlp);
 1330                 ncp = dvp->v_cache_dd;
 1331                 if (ncp == NULL) {
 1332                         SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
 1333                             "..", NULL);
 1334                         mtx_unlock(dvlp);
 1335                         return (0);
 1336                 }
 1337                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
 1338                         if (ncp->nc_flag & NCF_NEGATIVE)
 1339                                 *vpp = NULL;
 1340                         else
 1341                                 *vpp = ncp->nc_vp;
 1342                 } else
 1343                         *vpp = ncp->nc_dvp;
 1344                 /* Return failure if negative entry was found. */
 1345                 if (*vpp == NULL)
 1346                         goto negative_success;
 1347                 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
 1348                     dvp, cnp->cn_nameptr, *vpp);
 1349                 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
 1350                     *vpp);
 1351                 cache_out_ts(ncp, tsp, ticksp);
 1352                 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
 1353                     NCF_DTS && tsp != NULL) {
 1354                         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 1355                         *tsp = ncp_ts->nc_dotdottime;
 1356                 }
 1357                 goto success;
 1358         }
 1359 
 1360         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 1361         blp = HASH2BUCKETLOCK(hash);
 1362         rw_rlock(blp);
 1363 
 1364         LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 1365                 counter_u64_add(numchecks, 1);
 1366                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 1367                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 1368                         break;
 1369         }
 1370 
 1371         /* We failed to find an entry */
 1372         if (__predict_false(ncp == NULL)) {
 1373                 rw_runlock(blp);
 1374                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
 1375                     NULL);
 1376                 counter_u64_add(nummiss, 1);
 1377                 return (0);
 1378         }
 1379 
 1380         if (ncp->nc_flag & NCF_NEGATIVE)
 1381                 goto negative_success;
 1382 
 1383         /* We found a "positive" match, return the vnode */
 1384         counter_u64_add(numposhits, 1);
 1385         *vpp = ncp->nc_vp;
 1386         CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
 1387             dvp, cnp->cn_nameptr, *vpp, ncp);
 1388         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
 1389             *vpp);
 1390         cache_out_ts(ncp, tsp, ticksp);
 1391 success:
 1392         /*
 1393          * On success we return a locked and ref'd vnode as per the lookup
 1394          * protocol.
 1395          */
 1396         MPASS(dvp != *vpp);
 1397         ltype = 0;      /* silence gcc warning */
 1398         if (cnp->cn_flags & ISDOTDOT) {
 1399                 ltype = VOP_ISLOCKED(dvp);
 1400                 VOP_UNLOCK(dvp, 0);
 1401         }
 1402         vhold(*vpp);
 1403         cache_lookup_unlock(blp, dvlp);
 1404         error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
 1405         if (cnp->cn_flags & ISDOTDOT) {
 1406                 vn_lock(dvp, ltype | LK_RETRY);
 1407                 if (dvp->v_iflag & VI_DOOMED) {
 1408                         if (error == 0)
 1409                                 vput(*vpp);
 1410                         *vpp = NULL;
 1411                         return (ENOENT);
 1412                 }
 1413         }
 1414         if (error) {
 1415                 *vpp = NULL;
 1416                 goto retry;
 1417         }
 1418         if ((cnp->cn_flags & ISLASTCN) &&
 1419             (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
 1420                 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
 1421         }
 1422         return (-1);
 1423 
 1424 negative_success:
 1425         /* Found a negative match.  If the lookup intends to CREATE the name, purge the entry. */
 1426         if (cnp->cn_nameiop == CREATE) {
 1427                 counter_u64_add(numnegzaps, 1);
 1428                 goto zap_and_exit;
 1429         }
 1430 
 1431         counter_u64_add(numneghits, 1);
 1432         cache_negative_hit(ncp);
 1433         if (ncp->nc_flag & NCF_WHITE)
 1434                 cnp->cn_flags |= ISWHITEOUT;
 1435         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
 1436             ncp->nc_name);
 1437         cache_out_ts(ncp, tsp, ticksp);
 1438         cache_lookup_unlock(blp, dvlp);
 1439         return (ENOENT);
 1440 
 1441 zap_and_exit:
 1442         if (blp != NULL)
 1443                 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
 1444         else
 1445                 error = cache_zap_locked_vnode(ncp, dvp);
 1446         if (__predict_false(error != 0)) {
 1447                 zap_and_exit_bucket_fail2++;
 1448                 cache_maybe_yield();
 1449                 goto retry;
 1450         }
 1451         cache_free(ncp);
 1452         return (0);
 1453 }
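
/*
 * Editor's note (added commentary, not part of the upstream file): the return
 * protocol of cache_lookup() above can be read off vfs_cache_lookup() further
 * down in this file.  A minimal sketch of how a caller interprets the result:
 *
 *	error = cache_lookup(dvp, &vp, cnp, NULL, NULL);
 *	if (error == 0)
 *		;	// miss: fall back to the filesystem's own lookup
 *	else if (error == -1)
 *		;	// positive hit: vp is referenced and locked per cn_lkflags
 *	else
 *		;	// ENOENT from a negative entry (or a rare ".." race),
 *			// returned to the caller as the lookup result
 */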
 1454 
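/*
 * Editor's note: celockstate tracks every lock taken while inserting a new
 * namecache entry: up to three of the hashed per-vnode locks returned by
 * VP2VNODELOCK() (for dvp, vp and the vnode behind an old ".." entry that may
 * have to be evicted) and up to two bucket locks (the bucket of the new entry
 * and the bucket of the evicted one).
 */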
 1455 struct celockstate {
 1456         struct mtx *vlp[3];
 1457         struct rwlock *blp[2];
 1458 };
 1459 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
 1460 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
 1461 
 1462 static inline void
 1463 cache_celockstate_init(struct celockstate *cel)
 1464 {
 1465 
 1466         bzero(cel, sizeof(*cel));
 1467 }
 1468 
 1469 static void
 1470 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
 1471     struct vnode *dvp)
 1472 {
 1473         struct mtx *vlp1, *vlp2;
 1474 
 1475         MPASS(cel->vlp[0] == NULL);
 1476         MPASS(cel->vlp[1] == NULL);
 1477         MPASS(cel->vlp[2] == NULL);
 1478 
 1479         MPASS(vp != NULL || dvp != NULL);
 1480 
 1481         vlp1 = VP2VNODELOCK(vp);
 1482         vlp2 = VP2VNODELOCK(dvp);
 1483         cache_sort_vnodes(&vlp1, &vlp2);
 1484 
 1485         if (vlp1 != NULL) {
 1486                 mtx_lock(vlp1);
 1487                 cel->vlp[0] = vlp1;
 1488         }
 1489         mtx_lock(vlp2);
 1490         cel->vlp[1] = vlp2;
 1491 }
 1492 
 1493 static void
 1494 cache_unlock_vnodes_cel(struct celockstate *cel)
 1495 {
 1496 
 1497         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
 1498 
 1499         if (cel->vlp[0] != NULL)
 1500                 mtx_unlock(cel->vlp[0]);
 1501         if (cel->vlp[1] != NULL)
 1502                 mtx_unlock(cel->vlp[1]);
 1503         if (cel->vlp[2] != NULL)
 1504                 mtx_unlock(cel->vlp[2]);
 1505 }
 1506 
 1507 static bool
 1508 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
 1509 {
 1510         struct mtx *vlp;
 1511         bool ret;
 1512 
 1513         cache_assert_vlp_locked(cel->vlp[0]);
 1514         cache_assert_vlp_locked(cel->vlp[1]);
 1515         MPASS(cel->vlp[2] == NULL);
 1516 
 1517         MPASS(vp != NULL);
 1518         vlp = VP2VNODELOCK(vp);
 1519 
 1520         ret = true;
 1521         if (vlp >= cel->vlp[1]) {
 1522                 mtx_lock(vlp);
 1523         } else {
 1524                 if (mtx_trylock(vlp))
 1525                         goto out;
 1526                 cache_lock_vnodes_cel_3_failures++;
 1527                 cache_unlock_vnodes_cel(cel);
 1528                 if (vlp < cel->vlp[0]) {
 1529                         mtx_lock(vlp);
 1530                         mtx_lock(cel->vlp[0]);
 1531                         mtx_lock(cel->vlp[1]);
 1532                 } else {
 1533                         if (cel->vlp[0] != NULL)
 1534                                 mtx_lock(cel->vlp[0]);
 1535                         mtx_lock(vlp);
 1536                         mtx_lock(cel->vlp[1]);
 1537                 }
 1538                 ret = false;
 1539         }
 1540 out:
 1541         cel->vlp[2] = vlp;
 1542         return (ret);
 1543 }
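
/*
 * Editor's note: the lock order used throughout this file is "smaller lock
 * address first" (cf. cache_sort_vnodes() and the comment above
 * cache_enter_lock() below).  cache_lock_vnodes_cel_3() preserves that order
 * by try-locking when the third lock sorts below one already held; on failure
 * it drops everything, reacquires in address order and returns false so the
 * caller can re-validate whatever state it sampled before blocking.
 */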
 1544 
 1545 static void
 1546 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
 1547     struct rwlock *blp2)
 1548 {
 1549 
 1550         MPASS(cel->blp[0] == NULL);
 1551         MPASS(cel->blp[1] == NULL);
 1552 
 1553         cache_sort_vnodes(&blp1, &blp2);
 1554 
 1555         if (blp1 != NULL) {
 1556                 rw_wlock(blp1);
 1557                 cel->blp[0] = blp1;
 1558         }
 1559         rw_wlock(blp2);
 1560         cel->blp[1] = blp2;
 1561 }
 1562 
 1563 static void
 1564 cache_unlock_buckets_cel(struct celockstate *cel)
 1565 {
 1566 
 1567         if (cel->blp[0] != NULL)
 1568                 rw_wunlock(cel->blp[0]);
 1569         rw_wunlock(cel->blp[1]);
 1570 }
 1571 
 1572 /*
 1573  * Lock part of the cache affected by the insertion.
 1574  *
 1575  * This means vnodelocks for dvp, vp and the relevant bucketlock.
 1576  * However, insertion can result in removal of an old entry. In this
 1577  * case we have an additional vnode and bucketlock pair to lock.  If the
 1578  * old entry is negative, no third vnode lock is needed.
 1579  *
 1580  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 1581  * preserving the locking order (smaller address first).
 1582  */
 1583 static void
 1584 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
 1585     uint32_t hash)
 1586 {
 1587         struct namecache *ncp;
 1588         struct rwlock *blps[2];
 1589 
 1590         blps[0] = HASH2BUCKETLOCK(hash);
 1591         for (;;) {
 1592                 blps[1] = NULL;
 1593                 cache_lock_vnodes_cel(cel, dvp, vp);
 1594                 if (vp == NULL || vp->v_type != VDIR)
 1595                         break;
 1596                 ncp = vp->v_cache_dd;
 1597                 if (ncp == NULL)
 1598                         break;
 1599                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 1600                         break;
 1601                 MPASS(ncp->nc_dvp == vp);
 1602                 blps[1] = NCP2BUCKETLOCK(ncp);
 1603                 if (ncp->nc_flag & NCF_NEGATIVE)
 1604                         break;
 1605                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
 1606                         break;
 1607                 /*
 1608                  * All vnodes got re-locked. Re-validate the state and if
 1609                  * nothing changed we are done. Otherwise restart.
 1610                  */
 1611                 if (ncp == vp->v_cache_dd &&
 1612                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
 1613                     blps[1] == NCP2BUCKETLOCK(ncp) &&
 1614                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
 1615                         break;
 1616                 cache_unlock_vnodes_cel(cel);
 1617                 cel->vlp[0] = NULL;
 1618                 cel->vlp[1] = NULL;
 1619                 cel->vlp[2] = NULL;
 1620         }
 1621         cache_lock_buckets_cel(cel, blps[0], blps[1]);
 1622 }
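
/*
 * Editor's note: the worst case described above arises when vp is a directory
 * that already has a ".." entry cached: besides the dvp and vp locks and the
 * bucket of the new name, the old ".." entry's bucket must be locked, and, if
 * that entry is positive, so must the vnode it points to - three vnode locks
 * and two bucket locks in total.
 */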
 1623 
 1624 static void
 1625 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
 1626     uint32_t hash)
 1627 {
 1628         struct namecache *ncp;
 1629         struct rwlock *blps[2];
 1630 
 1631         blps[0] = HASH2BUCKETLOCK(hash);
 1632         for (;;) {
 1633                 blps[1] = NULL;
 1634                 cache_lock_vnodes_cel(cel, dvp, vp);
 1635                 ncp = dvp->v_cache_dd;
 1636                 if (ncp == NULL)
 1637                         break;
 1638                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 1639                         break;
 1640                 MPASS(ncp->nc_dvp == dvp);
 1641                 blps[1] = NCP2BUCKETLOCK(ncp);
 1642                 if (ncp->nc_flag & NCF_NEGATIVE)
 1643                         break;
 1644                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
 1645                         break;
 1646                 if (ncp == dvp->v_cache_dd &&
 1647                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
 1648                     blps[1] == NCP2BUCKETLOCK(ncp) &&
 1649                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
 1650                         break;
 1651                 cache_unlock_vnodes_cel(cel);
 1652                 cel->vlp[0] = NULL;
 1653                 cel->vlp[1] = NULL;
 1654                 cel->vlp[2] = NULL;
 1655         }
 1656         cache_lock_buckets_cel(cel, blps[0], blps[1]);
 1657 }
 1658 
 1659 static void
 1660 cache_enter_unlock(struct celockstate *cel)
 1661 {
 1662 
 1663         cache_unlock_buckets_cel(cel);
 1664         cache_unlock_vnodes_cel(cel);
 1665 }
 1666 
 1667 static void __noinline
 1668 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
 1669     struct componentname *cnp)
 1670 {
 1671         struct celockstate cel;
 1672         struct namecache *ncp;
 1673         uint32_t hash;
 1674         int len;
 1675 
 1676         if (dvp->v_cache_dd == NULL)
 1677                 return;
 1678         len = cnp->cn_namelen;
 1679         cache_celockstate_init(&cel);
 1680         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 1681         cache_enter_lock_dd(&cel, dvp, vp, hash);
 1682         ncp = dvp->v_cache_dd;
 1683         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
 1684                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
 1685                 cache_zap_locked(ncp, false);
 1686         } else {
 1687                 ncp = NULL;
 1688         }
 1689         dvp->v_cache_dd = NULL;
 1690         cache_enter_unlock(&cel);
 1691         cache_free(ncp);
 1692 }
 1693 
 1694 /*
 1695  * Add an entry to the cache.
 1696  */
 1697 void
 1698 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
 1699     struct timespec *tsp, struct timespec *dtsp)
 1700 {
 1701         struct celockstate cel;
 1702         struct namecache *ncp, *n2, *ndd;
 1703         struct namecache_ts *ncp_ts, *n2_ts;
 1704         struct nchashhead *ncpp;
 1705         uint32_t hash;
 1706         int flag;
 1707         int len;
 1708         u_long lnumcache;
 1709 
 1710         CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
 1711         VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
 1712             ("cache_enter: Adding a doomed vnode"));
 1713         VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
 1714             ("cache_enter: Doomed vnode used as src"));
 1715 
 1716         if (__predict_false(!doingcache))
 1717                 return;
 1718 
 1719         flag = 0;
 1720         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
 1721                 if (cnp->cn_namelen == 1)
 1722                         return;
 1723                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 1724                         cache_enter_dotdot_prep(dvp, vp, cnp);
 1725                         flag = NCF_ISDOTDOT;
 1726                 }
 1727         }
 1728 
 1729         /*
 1730          * Avoid blowout in namecache entries.
 1731          */
 1732         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
 1733         if (__predict_false(lnumcache >= ncsize)) {
 1734                 atomic_add_long(&numcache, -1);
 1735                 return;
 1736         }
 1737 
 1738         cache_celockstate_init(&cel);
 1739         ndd = NULL;
 1740         ncp_ts = NULL;
 1741 
 1742         /*
 1743          * Calculate the hash key and set up as much of the new
 1744          * namecache entry as possible before acquiring the lock.
 1745          */
 1746         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
 1747         ncp->nc_flag = flag;
 1748         ncp->nc_vp = vp;
 1749         if (vp == NULL)
 1750                 ncp->nc_flag |= NCF_NEGATIVE;
 1751         ncp->nc_dvp = dvp;
 1752         if (tsp != NULL) {
 1753                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 1754                 ncp_ts->nc_time = *tsp;
 1755                 ncp_ts->nc_ticks = ticks;
 1756                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
 1757                 if (dtsp != NULL) {
 1758                         ncp_ts->nc_dotdottime = *dtsp;
 1759                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
 1760                 }
 1761         }
 1762         len = ncp->nc_nlen = cnp->cn_namelen;
 1763         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 1764         strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
 1765         cache_enter_lock(&cel, dvp, vp, hash);
 1766 
 1767         /*
 1768          * See if this vnode or negative entry is already in the cache
 1769          * with this name.  This can happen with concurrent lookups of
 1770          * the same path name.
 1771          */
 1772         ncpp = NCHHASH(hash);
 1773         LIST_FOREACH(n2, ncpp, nc_hash) {
 1774                 if (n2->nc_dvp == dvp &&
 1775                     n2->nc_nlen == cnp->cn_namelen &&
 1776                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
 1777                         if (tsp != NULL) {
 1778                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
 1779                                     ("no NCF_TS"));
 1780                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
 1781                                 n2_ts->nc_time = ncp_ts->nc_time;
 1782                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
 1783                                 if (dtsp != NULL) {
 1784                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
 1785                                         if (ncp->nc_flag & NCF_NEGATIVE)
 1786                                                 mtx_lock(&ncneg_hot.nl_lock);
 1787                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
 1788                                         if (ncp->nc_flag & NCF_NEGATIVE)
 1789                                                 mtx_unlock(&ncneg_hot.nl_lock);
 1790                                 }
 1791                         }
 1792                         goto out_unlock_free;
 1793                 }
 1794         }
 1795 
 1796         if (flag == NCF_ISDOTDOT) {
 1797                 /*
 1798                  * See if we are trying to add a ".." entry, but some other
 1799                  * lookup has already populated the v_cache_dd pointer.
 1800                  */
 1801                 if (dvp->v_cache_dd != NULL)
 1802                         goto out_unlock_free;
 1803                 KASSERT(vp == NULL || vp->v_type == VDIR,
 1804                     ("wrong vnode type %p", vp));
 1805                 dvp->v_cache_dd = ncp;
 1806         }
 1807 
 1808         if (vp != NULL) {
 1809                 if (vp->v_type == VDIR) {
 1810                         if (flag != NCF_ISDOTDOT) {
 1811                                 /*
 1812                                  * In this case the cache entry maps both the
 1813                                  * name of the directory and, via v_cache_dd, the
 1814                                  * name ".." back to the directory's parent.
 1815                                  */
 1816                                 if ((ndd = vp->v_cache_dd) != NULL) {
 1817                                         if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
 1818                                                 cache_zap_locked(ndd, false);
 1819                                         else
 1820                                                 ndd = NULL;
 1821                                 }
 1822                                 vp->v_cache_dd = ncp;
 1823                         }
 1824                 } else {
 1825                         vp->v_cache_dd = NULL;
 1826                 }
 1827         }
 1828 
 1829         if (flag != NCF_ISDOTDOT) {
 1830                 if (LIST_EMPTY(&dvp->v_cache_src)) {
 1831                         vhold(dvp);
 1832                         counter_u64_add(numcachehv, 1);
 1833                 }
 1834                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 1835         }
 1836 
 1837         /*
 1838          * Insert the new namecache entry into the appropriate chain
 1839          * within the cache entries table.
 1840          */
 1841         LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 1842 
 1843         /*
 1844          * If the entry is "negative", we place it into the
 1845          * "negative" cache queue, otherwise, we place it into the
 1846          * destination vnode's cache entries queue.
 1847          */
 1848         if (vp != NULL) {
 1849                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
 1850                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
 1851                     vp);
 1852         } else {
 1853                 if (cnp->cn_flags & ISWHITEOUT)
 1854                         ncp->nc_flag |= NCF_WHITE;
 1855                 cache_negative_insert(ncp, false);
 1856                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
 1857                     ncp->nc_name);
 1858         }
 1859         cache_enter_unlock(&cel);
 1860         if (numneg * ncnegfactor > lnumcache)
 1861                 cache_negative_zap_one();
 1862         cache_free(ndd);
 1863         return;
 1864 out_unlock_free:
 1865         cache_enter_unlock(&cel);
 1866         atomic_add_long(&numcache, -1);
 1867         cache_free(ncp);
 1868         return;
 1869 }
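
/*
 * Editor's note: a hedged sketch of how a filesystem lookup routine typically
 * feeds this cache once it has resolved a component itself.  The VFS sets
 * MAKEENTRY in cn_flags when it wants the result cached; exact callers and
 * flags vary per filesystem:
 *
 *	if (error == 0 && (cnp->cn_flags & MAKEENTRY))
 *		cache_enter(dvp, *vpp, cnp);	// remember a positive result
 *	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY))
 *		cache_enter(dvp, NULL, cnp);	// remember a negative result
 */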
 1870 
 1871 static u_int
 1872 cache_roundup_2(u_int val)
 1873 {
 1874         u_int res;
 1875 
 1876         for (res = 1; res <= val; res <<= 1)
 1877                 continue;
 1878 
 1879         return (res);
 1880 }
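
/*
 * Editor's note: cache_roundup_2() returns the smallest power of two strictly
 * greater than its argument, e.g. cache_roundup_2(0) == 1,
 * cache_roundup_2(9) == 16 and cache_roundup_2(16) == 32.
 */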
 1881 
 1882 /*
 1883  * Name cache initialization, run from a SYSINIT during boot.
 1884  */
 1885 static void
 1886 nchinit(void *dummy __unused)
 1887 {
 1888         u_int i;
 1889 
 1890         cache_zone_small = uma_zcreate("S VFS Cache",
 1891             sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
 1892             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
 1893             UMA_ZONE_ZINIT);
 1894         cache_zone_small_ts = uma_zcreate("STS VFS Cache",
 1895             sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
 1896             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
 1897             UMA_ZONE_ZINIT);
 1898         cache_zone_large = uma_zcreate("L VFS Cache",
 1899             sizeof(struct namecache) + NAME_MAX + 1,
 1900             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
 1901             UMA_ZONE_ZINIT);
 1902         cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
 1903             sizeof(struct namecache_ts) + NAME_MAX + 1,
 1904             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
 1905             UMA_ZONE_ZINIT);
 1906 
 1907         ncsize = desiredvnodes * ncsizefactor;
 1908         nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
 1909         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
 1910         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
 1911                 ncbuckethash = 7;
 1912         if (ncbuckethash > nchash)
 1913                 ncbuckethash = nchash;
 1914         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
 1915             M_WAITOK | M_ZERO);
 1916         for (i = 0; i < numbucketlocks; i++)
 1917                 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
 1918         ncvnodehash = ncbuckethash;
 1919         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
 1920             M_WAITOK | M_ZERO);
 1921         for (i = 0; i < numvnodelocks; i++)
 1922                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
 1923         ncpurgeminvnodes = numbucketlocks * 2;
 1924 
 1925         neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
 1926             M_WAITOK | M_ZERO);
 1927         for (i = 0; i < numneglists; i++) {
 1928                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
 1929                 TAILQ_INIT(&neglists[i].nl_list);
 1930         }
 1931         mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
 1932         TAILQ_INIT(&ncneg_hot.nl_list);
 1933 
 1934         mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
 1935 
 1936         numcachehv = counter_u64_alloc(M_WAITOK);
 1937         numcalls = counter_u64_alloc(M_WAITOK);
 1938         dothits = counter_u64_alloc(M_WAITOK);
 1939         dotdothits = counter_u64_alloc(M_WAITOK);
 1940         numchecks = counter_u64_alloc(M_WAITOK);
 1941         nummiss = counter_u64_alloc(M_WAITOK);
 1942         nummisszap = counter_u64_alloc(M_WAITOK);
 1943         numposzaps = counter_u64_alloc(M_WAITOK);
 1944         numposhits = counter_u64_alloc(M_WAITOK);
 1945         numnegzaps = counter_u64_alloc(M_WAITOK);
 1946         numneghits = counter_u64_alloc(M_WAITOK);
 1947         numfullpathcalls = counter_u64_alloc(M_WAITOK);
 1948         numfullpathfail1 = counter_u64_alloc(M_WAITOK);
 1949         numfullpathfail2 = counter_u64_alloc(M_WAITOK);
 1950         numfullpathfail4 = counter_u64_alloc(M_WAITOK);
 1951         numfullpathfound = counter_u64_alloc(M_WAITOK);
 1952         zap_and_exit_bucket_relock_success = counter_u64_alloc(M_WAITOK);
 1953         numneg_evicted = counter_u64_alloc(M_WAITOK);
 1954         shrinking_skipped = counter_u64_alloc(M_WAITOK);
 1955 }
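
/*
 * Editor's note: a worked example of the bucket-lock sizing above.  On a
 * 16-CPU machine, cache_roundup_2(16 * 16) - 1 = 512 - 1 = 511, which is
 * above the minimum of 7 and (on any reasonably sized system) below nchash,
 * so ncbuckethash becomes the mask 511.  Assuming numbucketlocks is derived
 * as ncbuckethash + 1 earlier in the file, that gives 512 bucket locks.
 */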
 1956 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
 1957 
 1958 void
 1959 cache_changesize(int newmaxvnodes)
 1960 {
 1961         struct nchashhead *new_nchashtbl, *old_nchashtbl;
 1962         u_long new_nchash, old_nchash;
 1963         struct namecache *ncp;
 1964         uint32_t hash;
 1965         int newncsize;
 1966         int i;
 1967 
 1968         newncsize = newmaxvnodes * ncsizefactor;
 1969         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
 1970         if (newmaxvnodes < numbucketlocks)
 1971                 newmaxvnodes = numbucketlocks;
 1972 
 1973         new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
 1974         /* If same hash table size, nothing to do */
 1975         if (nchash == new_nchash) {
 1976                 free(new_nchashtbl, M_VFSCACHE);
 1977                 return;
 1978         }
 1979         /*
 1980          * Move everything from the old hash table to the new table.
 1981          * With all vnode and bucket locks held, no namecache entry can be
 1982          * removed concurrently, since removal requires those same locks.
 1983          */
 1984         cache_lock_all_vnodes();
 1985         cache_lock_all_buckets();
 1986         old_nchashtbl = nchashtbl;
 1987         old_nchash = nchash;
 1988         nchashtbl = new_nchashtbl;
 1989         nchash = new_nchash;
 1990         for (i = 0; i <= old_nchash; i++) {
 1991                 while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
 1992                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
 1993                             ncp->nc_dvp);
 1994                         LIST_REMOVE(ncp, nc_hash);
 1995                         LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
 1996                 }
 1997         }
 1998         ncsize = newncsize;
 1999         cache_unlock_all_buckets();
 2000         cache_unlock_all_vnodes();
 2001         free(old_nchashtbl, M_VFSCACHE);
 2002 }
 2003 
 2004 /*
 2005  * Invalidate all entries from and to a particular vnode.
 2006  */
 2007 void
 2008 cache_purge(struct vnode *vp)
 2009 {
 2010         TAILQ_HEAD(, namecache) ncps;
 2011         struct namecache *ncp, *nnp;
 2012         struct mtx *vlp, *vlp2;
 2013 
 2014         CTR1(KTR_VFS, "cache_purge(%p)", vp);
 2015         SDT_PROBE1(vfs, namecache, purge, done, vp);
 2016         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
 2017             vp->v_cache_dd == NULL)
 2018                 return;
 2019         TAILQ_INIT(&ncps);
 2020         vlp = VP2VNODELOCK(vp);
 2021         vlp2 = NULL;
 2022         mtx_lock(vlp);
 2023 retry:
 2024         while (!LIST_EMPTY(&vp->v_cache_src)) {
 2025                 ncp = LIST_FIRST(&vp->v_cache_src);
 2026                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 2027                         goto retry;
 2028                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 2029         }
 2030         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
 2031                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
 2032                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 2033                         goto retry;
 2034                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 2035         }
 2036         ncp = vp->v_cache_dd;
 2037         if (ncp != NULL) {
 2038                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
 2039                    ("lost dotdot link"));
 2040                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 2041                         goto retry;
 2042                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 2043         }
 2044         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
 2045         mtx_unlock(vlp);
 2046         if (vlp2 != NULL)
 2047                 mtx_unlock(vlp2);
 2048         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 2049                 cache_free(ncp);
 2050         }
 2051 }
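
/*
 * Editor's note: cache_purge() and the purge routines that follow defer
 * cache_free() until after the namecache locks are dropped: zapped entries
 * are collected on the local ncps list and only then released, keeping lock
 * hold times short.
 */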
 2052 
 2053 /*
 2054  * Invalidate all negative entries for a particular directory vnode.
 2055  */
 2056 void
 2057 cache_purge_negative(struct vnode *vp)
 2058 {
 2059         TAILQ_HEAD(, namecache) ncps;
 2060         struct namecache *ncp, *nnp;
 2061         struct mtx *vlp;
 2062 
 2063         CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
 2064         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
 2065         if (LIST_EMPTY(&vp->v_cache_src))
 2066                 return;
 2067         TAILQ_INIT(&ncps);
 2068         vlp = VP2VNODELOCK(vp);
 2069         mtx_lock(vlp);
 2070         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
 2071                 if (!(ncp->nc_flag & NCF_NEGATIVE))
 2072                         continue;
 2073                 cache_zap_negative_locked_vnode_kl(ncp, vp);
 2074                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 2075         }
 2076         mtx_unlock(vlp);
 2077         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 2078                 cache_free(ncp);
 2079         }
 2080 }
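
/*
 * Editor's note: negative entries hang off the directory's v_cache_src list
 * and record names known not to exist; they go stale as soon as such a name
 * is created, which is why filesystems purge them here, typically from their
 * rename handling (a hedged observation about common callers, not a complete
 * list).
 */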
 2081 
 2082 /*
 2083  * Flush all entries referencing a particular filesystem.
 2084  */
 2085 void
 2086 cache_purgevfs(struct mount *mp, bool force)
 2087 {
 2088         TAILQ_HEAD(, namecache) ncps;
 2089         struct mtx *vlp1, *vlp2;
 2090         struct rwlock *blp;
 2091         struct nchashhead *bucket;
 2092         struct namecache *ncp, *nnp;
 2093         u_long i, j, n_nchash;
 2094         int error;
 2095 
 2096         /* Scan hash tables for applicable entries */
 2097         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
 2098         if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
 2099                 return;
 2100         TAILQ_INIT(&ncps);
 2101         n_nchash = nchash + 1;
 2102         vlp1 = vlp2 = NULL;
 2103         for (i = 0; i < numbucketlocks; i++) {
 2104                 blp = (struct rwlock *)&bucketlocks[i];
 2105                 rw_wlock(blp);
 2106                 for (j = i; j < n_nchash; j += numbucketlocks) {
 2107 retry:
 2108                         bucket = &nchashtbl[j];
 2109                         LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
 2110                                 cache_assert_bucket_locked(ncp, RA_WLOCKED);
 2111                                 if (ncp->nc_dvp->v_mount != mp)
 2112                                         continue;
 2113                                 error = cache_zap_wlocked_bucket_kl(ncp, blp,
 2114                                     &vlp1, &vlp2);
 2115                                 if (error != 0)
 2116                                         goto retry;
 2117                                 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
 2118                         }
 2119                 }
 2120                 rw_wunlock(blp);
 2121                 if (vlp1 == NULL && vlp2 == NULL)
 2122                         cache_maybe_yield();
 2123         }
 2124         if (vlp1 != NULL)
 2125                 mtx_unlock(vlp1);
 2126         if (vlp2 != NULL)
 2127                 mtx_unlock(vlp2);
 2128 
 2129         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 2130                 cache_free(ncp);
 2131         }
 2132 }
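
/*
 * Editor's note: the nested loops above visit every hash chain exactly once
 * while holding the matching bucket lock.  Assuming HASH2BUCKETLOCK() (defined
 * earlier in the file) picks the lock by taking the hash modulo the number of
 * bucket locks, chain j is guarded by bucketlocks[j % numbucketlocks], so the
 * inner loop starting at j = i and stepping by numbucketlocks covers exactly
 * the chains protected by bucketlocks[i].
 */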
 2133 
 2134 /*
 2135  * Perform canonical checks and a cache lookup, and call the filesystem's
 2136  * VOP_CACHEDLOOKUP() only if needed.
 2137  */
 2138 
 2139 int
 2140 vfs_cache_lookup(struct vop_lookup_args *ap)
 2141 {
 2142         struct vnode *dvp;
 2143         int error;
 2144         struct vnode **vpp = ap->a_vpp;
 2145         struct componentname *cnp = ap->a_cnp;
 2146         int flags = cnp->cn_flags;
 2147 
 2148         *vpp = NULL;
 2149         dvp = ap->a_dvp;
 2150 
 2151         if (dvp->v_type != VDIR)
 2152                 return (ENOTDIR);
 2153 
 2154         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 2155             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 2156                 return (EROFS);
 2157 
 2158         error = vn_dir_check_exec(dvp, cnp);
 2159         if (error != 0)
 2160                 return (error);
 2161 
 2162         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 2163         if (error == 0)
 2164                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 2165         if (error == -1)
 2166                 return (0);
 2167         return (error);
 2168 }
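
/*
 * Editor's note: a sketch of how a filesystem plugs into the fast path above.
 * The "examplefs" names are illustrative; in-tree filesystems such as UFS
 * follow this shape, pointing vop_lookup at vfs_cache_lookup() and supplying
 * their real lookup routine as vop_cachedlookup:
 *
 *	static struct vop_vector examplefs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= examplefs_lookup,
 *		(remaining VOPs elided)
 *	};
 */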
 2169 
 2170 /*
 2171  * XXX All of these sysctls would probably be more productive dead.
 2172  */
 2173 static int __read_mostly disablecwd;
 2174 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
 2175    "Disable the getcwd syscall");
 2176 
 2177 /* Implementation of the getcwd syscall. */
 2178 int
 2179 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
 2180 {
 2181 
 2182         return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
 2183             MAXPATHLEN));
 2184 }
 2185 
 2186 int
 2187 kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, size_t buflen,
 2188     size_t path_max)
 2189 {
 2190         char *bp, *tmpbuf;
 2191         struct filedesc *fdp;
 2192         struct vnode *cdir, *rdir;
 2193         int error;
 2194 
 2195         if (__predict_false(disablecwd))
 2196                 return (ENODEV);
 2197         if (__predict_false(buflen < 2))
 2198                 return (EINVAL);
 2199         if (buflen > path_max)
 2200                 buflen = path_max;
 2201 
 2202         tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
 2203         fdp = td->td_proc->p_fd;
 2204         FILEDESC_SLOCK(fdp);
 2205         cdir = fdp->fd_cdir;
 2206         vrefact(cdir);
 2207         rdir = fdp->fd_rdir;
 2208         vrefact(rdir);
 2209         FILEDESC_SUNLOCK(fdp);
 2210         error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
 2211         vrele(rdir);
 2212         vrele(cdir);
 2213 
 2214         if (!error) {
 2215                 if (bufseg == UIO_SYSSPACE)
 2216                         bcopy(bp, buf, strlen(bp) + 1);
 2217                 else
 2218                         error = copyout(bp, buf, strlen(bp) + 1);
 2219 #ifdef KTRACE
 2220                 if (KTRPOINT(curthread, KTR_NAMEI))
 2221                         ktrnamei(bp);
 2222 #endif
 2223         }
 2224         free(tmpbuf, M_TEMP);
 2225         return (error);
 2226 }
 2227 
 2228 /*
 2229  * Thus begins the fullpath magic.
 2230  */
 2231 
 2232 static int __read_mostly disablefullpath;
 2233 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
 2234     "Disable the vn_fullpath function");
 2235 
 2236 /*
 2237  * Retrieve the full filesystem path that corresponds to a vnode from the name
 2238  * cache (if available)
 2239  */
 2240 int
 2241 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
 2242 {
 2243         char *buf;
 2244         struct filedesc *fdp;
 2245         struct vnode *rdir;
 2246         int error;
 2247 
 2248         if (__predict_false(disablefullpath))
 2249                 return (ENODEV);
 2250         if (__predict_false(vn == NULL))
 2251                 return (EINVAL);
 2252 
 2253         buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 2254         fdp = td->td_proc->p_fd;
 2255         FILEDESC_SLOCK(fdp);
 2256         rdir = fdp->fd_rdir;
 2257         vrefact(rdir);
 2258         FILEDESC_SUNLOCK(fdp);
 2259         error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
 2260         vrele(rdir);
 2261 
 2262         if (!error)
 2263                 *freebuf = buf;
 2264         else
 2265                 free(buf, M_TEMP);
 2266         return (error);
 2267 }
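
/*
 * Editor's note: a minimal usage sketch for vn_fullpath().  The buffer handed
 * back through freebuf is the M_TEMP allocation made above and must be freed
 * by the caller once the path is no longer needed:
 *
 *	char *fullpath, *freepath;
 *
 *	if (vn_fullpath(curthread, vp, &fullpath, &freepath) == 0) {
 *		printf("%s\n", fullpath);
 *		free(freepath, M_TEMP);
 *	}
 */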
 2268 
 2269 /*
 2270  * This function is similar to vn_fullpath, but it attempts to lookup the
 2271  * pathname relative to the global root mount point.  This is required for the
 2272  * auditing sub-system, as audited pathnames must be absolute, relative to the
 2273  * global root mount point.
 2274  */
 2275 int
 2276 vn_fullpath_global(struct thread *td, struct vnode *vn,
 2277     char **retbuf, char **freebuf)
 2278 {
 2279         char *buf;
 2280         int error;
 2281 
 2282         if (__predict_false(disablefullpath))
 2283                 return (ENODEV);
 2284         if (__predict_false(vn == NULL))
 2285                 return (EINVAL);
 2286         buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 2287         error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
 2288         if (!error)
 2289                 *freebuf = buf;
 2290         else
 2291                 free(buf, M_TEMP);
 2292         return (error);
 2293 }
 2294 
 2295 int
 2296 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
 2297 {
 2298         struct vnode *dvp;
 2299         struct namecache *ncp;
 2300         struct mtx *vlp;
 2301         int error;
 2302 
 2303         vlp = VP2VNODELOCK(*vp);
 2304         mtx_lock(vlp);
 2305         TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
 2306                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 2307                         break;
 2308         }
 2309         if (ncp != NULL) {
 2310                 if (*buflen < ncp->nc_nlen) {
 2311                         mtx_unlock(vlp);
 2312                         vrele(*vp);
 2313                         counter_u64_add(numfullpathfail4, 1);
 2314                         error = ENOMEM;
 2315                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
 2316                             vp, NULL);
 2317                         return (error);
 2318                 }
 2319                 *buflen -= ncp->nc_nlen;
 2320                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
 2321                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
 2322                     ncp->nc_name, vp);
 2323                 dvp = *vp;
 2324                 *vp = ncp->nc_dvp;
 2325                 vref(*vp);
 2326                 mtx_unlock(vlp);
 2327                 vrele(dvp);
 2328                 return (0);
 2329         }
 2330         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
 2331 
 2332         mtx_unlock(vlp);
 2333         vn_lock(*vp, LK_SHARED | LK_RETRY);
 2334         error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
 2335         vput(*vp);
 2336         if (error) {
 2337                 counter_u64_add(numfullpathfail2, 1);
 2338                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
 2339                 return (error);
 2340         }
 2341 
 2342         *vp = dvp;
 2343         if (dvp->v_iflag & VI_DOOMED) {
 2344                 /* forced unmount */
 2345                 vrele(dvp);
 2346                 error = ENOENT;
 2347                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
 2348                 return (error);
 2349         }
 2350         /*
 2351          * The parent vnode now stored in *vp has its use count incremented.
 2352          */
 2353 
 2354         return (0);
 2355 }
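
/*
 * Editor's note on the calling convention of vn_vptocnp(): the routine
 * consumes the reference on the vnode passed in via *vp and, on success,
 * stores the referenced parent directory back into *vp while prepending the
 * component name just before buf[*buflen] and lowering *buflen accordingly.
 * The buffer is therefore filled from the end towards the beginning, which is
 * what vn_fullpath1() below relies on.
 */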
 2356 
 2357 /*
 2358  * The magic behind kern___getcwd() and vn_fullpath().
 2359  */
 2360 static int
 2361 vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
 2362     char *buf, char **retbuf, u_int buflen)
 2363 {
 2364         int error, slash_prefixed;
 2365 #ifdef KDTRACE_HOOKS
 2366         struct vnode *startvp = vp;
 2367 #endif
 2368         struct vnode *vp1;
 2369 
 2370         buflen--;
 2371         buf[buflen] = '\0';
 2372         error = 0;
 2373         slash_prefixed = 0;
 2374 
 2375         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
 2376         counter_u64_add(numfullpathcalls, 1);
 2377         vref(vp);
 2378         if (vp->v_type != VDIR) {
 2379                 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 2380                 if (error)
 2381                         return (error);
 2382                 if (buflen == 0) {
 2383                         vrele(vp);
 2384                         return (ENOMEM);
 2385                 }
 2386                 buf[--buflen] = '/';
 2387                 slash_prefixed = 1;
 2388         }
 2389         while (vp != rdir && vp != rootvnode) {
 2390                 /*
 2391                  * The vp vnode must be already fully constructed,
 2392                  * since it is either found in namecache or obtained
 2393                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
 2394                  * without obtaining the vnode lock.
 2395                  */
 2396                 if ((vp->v_vflag & VV_ROOT) != 0) {
 2397                         vn_lock(vp, LK_RETRY | LK_SHARED);
 2398 
 2399                         /*
 2400                          * With the vnode locked, check for races with
 2401                          * unmount, forced or not.  Note that we
 2402                          * already verified that vp is not equal to
 2403                          * the root vnode, which means that
 2404                          * mnt_vnodecovered can be NULL only for the
 2405                          * case of unmount.
 2406                          */
 2407                         if ((vp->v_iflag & VI_DOOMED) != 0 ||
 2408                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
 2409                             vp1->v_mountedhere != vp->v_mount) {
 2410                                 vput(vp);
 2411                                 error = ENOENT;
 2412                                 SDT_PROBE3(vfs, namecache, fullpath, return,
 2413                                     error, vp, NULL);
 2414                                 break;
 2415                         }
 2416 
 2417                         vref(vp1);
 2418                         vput(vp);
 2419                         vp = vp1;
 2420                         continue;
 2421                 }
 2422                 if (vp->v_type != VDIR) {
 2423                         vrele(vp);
 2424                         counter_u64_add(numfullpathfail1, 1);
 2425                         error = ENOTDIR;
 2426                         SDT_PROBE3(vfs, namecache, fullpath, return,
 2427                             error, vp, NULL);
 2428                         break;
 2429                 }
 2430                 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 2431                 if (error)
 2432                         break;
 2433                 if (buflen == 0) {
 2434                         vrele(vp);
 2435                         error = ENOMEM;
 2436                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
 2437                             startvp, NULL);
 2438                         break;
 2439                 }
 2440                 buf[--buflen] = '/';
 2441                 slash_prefixed = 1;
 2442         }
 2443         if (error)
 2444                 return (error);
 2445         if (!slash_prefixed) {
 2446                 if (buflen == 0) {
 2447                         vrele(vp);
 2448                         counter_u64_add(numfullpathfail4, 1);
 2449                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
 2450                             startvp, NULL);
 2451                         return (ENOMEM);
 2452                 }
 2453                 buf[--buflen] = '/';
 2454         }
 2455         counter_u64_add(numfullpathfound, 1);
 2456         vrele(vp);
 2457 
 2458         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
 2459         *retbuf = buf + buflen;
 2460         return (0);
 2461 }
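
/*
 * Editor's note: a small illustration of the backwards construction used by
 * vn_fullpath1().  Resolving a vnode reached via "/a/b" with an 8-byte buffer
 * proceeds roughly as follows (buflen is lowered before each write):
 *
 *	index:        0 1 2 3 4 5 6 7
 *	start:        . . . . . . . \0	(buflen = 7)
 *	"b" and '/':  . . . . . / b \0	(buflen = 5)
 *	"a" and '/':  . . . / a / b \0	(buflen = 3)
 *
 * after which *retbuf points at buf + 3, i.e. the string "/a/b".
 */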
 2462 
 2463 struct vnode *
 2464 vn_dir_dd_ino(struct vnode *vp)
 2465 {
 2466         struct namecache *ncp;
 2467         struct vnode *ddvp;
 2468         struct mtx *vlp;
 2469 
 2470         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
 2471         vlp = VP2VNODELOCK(vp);
 2472         mtx_lock(vlp);
 2473         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
 2474                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
 2475                         continue;
 2476                 ddvp = ncp->nc_dvp;
 2477                 vhold(ddvp);
 2478                 mtx_unlock(vlp);
 2479                 if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread))
 2480                         return (NULL);
 2481                 return (ddvp);
 2482         }
 2483         mtx_unlock(vlp);
 2484         return (NULL);
 2485 }
 2486 
 2487 int
 2488 vn_commname(struct vnode *vp, char *buf, u_int buflen)
 2489 {
 2490         struct namecache *ncp;
 2491         struct mtx *vlp;
 2492         int l;
 2493 
 2494         vlp = VP2VNODELOCK(vp);
 2495         mtx_lock(vlp);
 2496         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
 2497                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 2498                         break;
 2499         if (ncp == NULL) {
 2500                 mtx_unlock(vlp);
 2501                 return (ENOENT);
 2502         }
 2503         l = min(ncp->nc_nlen, buflen - 1);
 2504         memcpy(buf, ncp->nc_name, l);
 2505         mtx_unlock(vlp);
 2506         buf[l] = '\0';
 2507         return (0);
 2508 }
 2509 
 2510 /* ABI compat shims for old kernel modules. */
 2511 #undef cache_enter
 2512 
 2513 void    cache_enter(struct vnode *dvp, struct vnode *vp,
 2514             struct componentname *cnp);
 2515 
 2516 void
 2517 cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 2518 {
 2519 
 2520         cache_enter_time(dvp, vp, cnp, NULL, NULL);
 2521 }
 2522 
 2523 /*
 2524  * This function updates the path string to the vnode's full global path
 2525  * and checks the size of the new path string against the pathlen argument.
 2526  *
 2527  * Requires a locked, referenced vnode.
 2528  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 2529  *
 2530  * If sysctl debug.disablefullpath is set, ENODEV is returned,
 2531  * the vnode is left locked and the path remains untouched.
 2532  *
 2533  * If vp is a directory, the call to vn_fullpath_global() always succeeds
 2534  * because it falls back to the ".." lookup if the namecache lookup fails.
 2535  */
 2536 int
 2537 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
 2538     u_int pathlen)
 2539 {
 2540         struct nameidata nd;
 2541         struct vnode *vp1;
 2542         char *rpath, *fbuf;
 2543         int error;
 2544 
 2545         ASSERT_VOP_ELOCKED(vp, __func__);
 2546 
 2547         /* Return ENODEV if sysctl debug.disablefullpath==1 */
 2548         if (__predict_false(disablefullpath))
 2549                 return (ENODEV);
 2550 
 2551         /* Construct global filesystem path from vp. */
 2552         VOP_UNLOCK(vp, 0);
 2553         error = vn_fullpath_global(td, vp, &rpath, &fbuf);
 2554 
 2555         if (error != 0) {
 2556                 vrele(vp);
 2557                 return (error);
 2558         }
 2559 
 2560         if (strlen(rpath) >= pathlen) {
 2561                 vrele(vp);
 2562                 error = ENAMETOOLONG;
 2563                 goto out;
 2564         }
 2565 
 2566         /*
 2567          * Re-lookup the vnode by path to detect a possible rename.
 2568          * As a side effect, the vnode is relocked.
 2569          * If the vnode was renamed, return ENOENT.
 2570          */
 2571         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 2572             UIO_SYSSPACE, path, td);
 2573         error = namei(&nd);
 2574         if (error != 0) {
 2575                 vrele(vp);
 2576                 goto out;
 2577         }
 2578         NDFREE(&nd, NDF_ONLY_PNBUF);
 2579         vp1 = nd.ni_vp;
 2580         vrele(vp);
 2581         if (vp1 == vp)
 2582                 strcpy(path, rpath);
 2583         else {
 2584                 vput(vp1);
 2585                 error = ENOENT;
 2586         }
 2587 
 2588 out:
 2589         free(fbuf, M_TEMP);
 2590         return (error);
 2591 }
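
/*
 * Editor's note: a usage sketch restating the contract documented above
 * rather than adding to it.  The caller holds vp exclusively locked and
 * referenced and passes the path it used to reach it:
 *
 *	error = vn_path_to_global_path(td, vp, path, MAXPATHLEN);
 *	// on success, vp is locked again and path has been rewritten to the
 *	// global path; on ENODEV, vp stays locked and path is untouched; on
 *	// other errors, vp has been unlocked
 */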
 2592 
 2593 #ifdef DDB
 2594 static void
 2595 db_print_vpath(struct vnode *vp)
 2596 {
 2597 
 2598         while (vp != NULL) {
 2599                 db_printf("%p: ", vp);
 2600                 if (vp == rootvnode) {
 2601                         db_printf("/");
 2602                         vp = NULL;
 2603                 } else {
 2604                         if (vp->v_vflag & VV_ROOT) {
 2605                                 db_printf("<mount point>");
 2606                                 vp = vp->v_mount->mnt_vnodecovered;
 2607                         } else {
 2608                                 struct namecache *ncp;
 2609                                 char *ncn;
 2610                                 int i;
 2611 
 2612                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
 2613                                 if (ncp != NULL) {
 2614                                         ncn = ncp->nc_name;
 2615                                         for (i = 0; i < ncp->nc_nlen; i++)
 2616                                                 db_printf("%c", *ncn++);
 2617                                         vp = ncp->nc_dvp;
 2618                                 } else {
 2619                                         vp = NULL;
 2620                                 }
 2621                         }
 2622                 }
 2623                 db_printf("\n");
 2624         }
 2625 
 2626         return;
 2627 }
 2628 
 2629 DB_SHOW_COMMAND(vpath, db_show_vpath)
 2630 {
 2631         struct vnode *vp;
 2632 
 2633         if (!have_addr) {
 2634                 db_printf("usage: show vpath <struct vnode *>\n");
 2635                 return;
 2636         }
 2637 
 2638         vp = (struct vnode *)addr;
 2639         db_print_vpath(vp);
 2640 }
 2641 
 2642 #endif
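
/*
 * Editor's note: with DDB compiled in, the command above prints the cached
 * path leading to a vnode from the debugger prompt, e.g.:
 *
 *	db> show vpath 0xfffff80012345678
 *
 * (the address is an illustrative struct vnode pointer).
 */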
