FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_cache.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-3-Clause
    3  *
    4  * Copyright (c) 1989, 1993, 1995
    5  *      The Regents of the University of California.  All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * Poul-Henning Kamp of the FreeBSD Project.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
   35  */
   36 
   37 #include <sys/cdefs.h>
   38 __FBSDID("$FreeBSD: releng/12.0/sys/kern/vfs_cache.c 335437 2018-06-20 08:34:29Z bz $");
   39 
   40 #include "opt_ddb.h"
   41 #include "opt_ktrace.h"
   42 
   43 #include <sys/param.h>
   44 #include <sys/systm.h>
   45 #include <sys/counter.h>
   46 #include <sys/filedesc.h>
   47 #include <sys/fnv_hash.h>
   48 #include <sys/kernel.h>
   49 #include <sys/lock.h>
   50 #include <sys/malloc.h>
   51 #include <sys/fcntl.h>
   52 #include <sys/mount.h>
   53 #include <sys/namei.h>
   54 #include <sys/proc.h>
   55 #include <sys/rwlock.h>
   56 #include <sys/sdt.h>
   57 #include <sys/smp.h>
   58 #include <sys/syscallsubr.h>
   59 #include <sys/sysctl.h>
   60 #include <sys/sysproto.h>
   61 #include <sys/vnode.h>
   62 #ifdef KTRACE
   63 #include <sys/ktrace.h>
   64 #endif
   65 
   66 #ifdef DDB
   67 #include <ddb/ddb.h>
   68 #endif
   69 
   70 #include <vm/uma.h>
   71 
   72 SDT_PROVIDER_DECLARE(vfs);
   73 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
   74     "struct vnode *");
   75 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
   76     "char *");
   77 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
   78 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
   79     "char *", "struct vnode *");
   80 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
   81 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
   82     "struct vnode *", "char *");
   83 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
   84     "struct vnode *");
   85 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
   86     "struct vnode *", "char *");
   87 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
   88     "char *");
   89 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
   90 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
   91 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
   92 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
   93     "struct vnode *");
   94 SDT_PROBE_DEFINE3(vfs, namecache, zap_negative, done, "struct vnode *",
   95     "char *", "int");
   96 SDT_PROBE_DEFINE3(vfs, namecache, shrink_negative, done, "struct vnode *",
   97     "char *", "int");
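      /*
       * Illustrative note (not part of the upstream file): each
       * SDT_PROBE_DEFINE* above is exported as a DTrace probe named
       * vfs:namecache:<function>:<name>, e.g. vfs:namecache:lookup:hit or
       * vfs:namecache:lookup:miss.  A hypothetical one-liner for seeing which
       * names are missing the cache could look like:
       *
       *      dtrace -n 'vfs:namecache:lookup:miss { @[stringof(arg1)] = count(); }'
       *
       * where arg1 is the "char *" name argument declared for that probe.
       */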
   98 
   99 /*
  100  * This structure describes the elements in the cache of recent
  101  * names looked up by namei.
  102  */
  103 
  104 struct  namecache {
  105         LIST_ENTRY(namecache) nc_hash;  /* hash chain */
  106         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
  107         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
  108         struct  vnode *nc_dvp;          /* vnode of parent of name */
  109         union {
  110                 struct  vnode *nu_vp;   /* vnode the name refers to */
  111                 u_int   nu_neghits;     /* negative entry hits */
  112         } n_un;
  113         u_char  nc_flag;                /* flag bits */
  114         u_char  nc_nlen;                /* length of name */
  115         char    nc_name[0];             /* segment name + nul */
  116 };
  117 
  118 /*
  119  * struct namecache_ts repeats struct namecache layout up to the
  120  * nc_nlen member.
  121  * struct namecache_ts is used in place of struct namecache when time(s) need
  122  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
  123  * both a non-dotdot directory name and dotdot for the directory's
  124  * parent.
  125  */
  126 struct  namecache_ts {
  127         struct  timespec nc_time;       /* timespec provided by fs */
  128         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
  129         int     nc_ticks;               /* ticks value when entry was added */
  130         struct namecache nc_nc;
  131 };
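      /*
       * Illustrative sketch (not part of the upstream file): since struct
       * namecache is the last member of struct namecache_ts, code holding a
       * plain "struct namecache *" for an entry that carries timestamps
       * (NCF_TS set) can recover the containing structure:
       *
       *      struct namecache_ts *ncp_ts;
       *
       *      if (ncp->nc_flag & NCF_TS) {
       *              ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
       *              // ncp_ts->nc_time and ncp_ts->nc_ticks are now usable
       *      }
       *
       * This is the same pattern used by cache_out_ts() and cache_free()
       * below.
       */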
  132 
  133 #define nc_vp           n_un.nu_vp
  134 #define nc_neghits      n_un.nu_neghits
  135 
  136 /*
  137  * Flags in namecache.nc_flag
  138  */
  139 #define NCF_WHITE       0x01
  140 #define NCF_ISDOTDOT    0x02
  141 #define NCF_TS          0x04
  142 #define NCF_DTS         0x08
  143 #define NCF_DVDROP      0x10
  144 #define NCF_NEGATIVE    0x20
  145 #define NCF_HOTNEGATIVE 0x40
  146 
  147 /*
  148  * Name caching works as follows:
  149  *
  150  * Names found by directory scans are retained in a cache
  151  * for future reference.  It is managed on an LRU basis, so frequently
  152  * used names will hang around.  The cache is indexed by a hash value
  153  * obtained from (vp, name), where vp refers to the directory
  154  * containing name.
  155  *
  156  * If it is a "negative" entry (i.e. for a name that is known NOT to
  157  * exist), the vnode pointer will be NULL.
  158  *
  159  * Upon reaching the last segment of a path, if the reference
  160  * is for DELETE, or NOCACHE is set (rewrite), and the
  161  * name is located in the cache, it will be dropped.
  162  *
  163  * These locks are used (in the order in which they can be taken):
  164  * NAME         TYPE    ROLE
  165  * vnodelock    mtx     vnode lists and v_cache_dd field protection
  166  * bucketlock   rwlock  for access to given set of hash buckets
  167  * neglist      mtx     negative entry LRU management
  168  *
  169  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
  170  * shrinking the LRU list.
  171  *
  172  * It is legal to take multiple vnodelock and bucketlock locks. The locking
  173  * order is lower address first. Both are recursive.
  174  *
  175  * "." lookups are lockless.
  176  *
  177  * ".." and vnode -> name lookups require vnodelock.
  178  *
  179  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
  180  *
  181  * Insertions and removals of entries require involved vnodes and bucketlocks
  182  * to be write-locked to prevent other threads from seeing the entry.
  183  *
  184  * Some lookups result in removal of the found entry (e.g. getting rid of a
  185  * negative entry with the intent to create a positive one), which poses a
  186  * problem when multiple threads reach that state at once. Similarly, two different
  187  * threads can purge two different vnodes and try to remove the same name.
  188  *
  189  * If the already held vnode lock is lower than the second required lock, we
  190  * can just take the other lock. However, in the opposite case, this could
  191  * deadlock. As such, this is resolved by trylocking; if that fails, we unlock
  192  * the first lock, lock everything in order and revalidate the state.
  193  */
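      /*
       * A minimal sketch of the trylock/relock dance described in the last
       * paragraph above (illustrative only; the real variants live in
       * cache_zap_locked_vnode() and cache_zap_locked_vnode_kl2() below):
       *
       *      vlp1 = VP2VNODELOCK(ncp->nc_dvp);
       *      vlp2 = VP2VNODELOCK(ncp->nc_vp);
       *      cache_sort(&vlp1, &vlp2);       // lower address goes first
       *      if (vlp1 == lock_already_held) {
       *              mtx_lock(vlp2);         // order is preserved, just lock
       *      } else if (!mtx_trylock(vlp1)) {
       *              // taking vlp1 outright could deadlock: drop what we
       *              // hold, lock everything in order and revalidate that
       *              // the entry is still the one we want
       *      }
       *
       * Here "lock_already_held" stands for whichever vnode lock the caller
       * already owns.
       */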
  194 
  195 /*
  196  * Structures associated with name caching.
  197  */
  198 #define NCHHASH(hash) \
  199         (&nchashtbl[(hash) & nchash])
  200 static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
  201 static u_long __read_mostly     nchash;                 /* size of hash table */
  202 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
  203     "Size of namecache hash table");
  204 static u_long __read_mostly     ncnegfactor = 12; /* ratio of negative entries */
  205 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
  206     "Ratio of negative namecache entries");
  207 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
  208 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
  209     "Number of negative entries in namecache");
  210 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
  211 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
  212     "Number of namecache entries");
  213 static u_long __exclusive_cache_line    numcachehv;/* number of cache entries with vnodes held */
  214 SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0,
  215     "Number of namecache entries with vnodes held");
  216 u_int __read_mostly     ncsizefactor = 2;
  217 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
  218     "Size factor for namecache");
  219 static u_int __read_mostly      ncpurgeminvnodes;
  220 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
  221     "Number of vnodes below which purgevfs ignores the request");
  222 static u_int __read_mostly      ncneghitsrequeue = 8;
  223 SYSCTL_UINT(_vfs, OID_AUTO, ncneghitsrequeue, CTLFLAG_RW, &ncneghitsrequeue, 0,
  224     "Number of hits to requeue a negative entry in the LRU list");
  225 
  226 struct nchstats nchstats;               /* cache effectiveness statistics */
  227 
  228 static struct mtx       ncneg_shrink_lock;
  229 static int      shrink_list_turn;
  230 
  231 struct neglist {
  232         struct mtx              nl_lock;
  233         TAILQ_HEAD(, namecache) nl_list;
  234 } __aligned(CACHE_LINE_SIZE);
  235 
  236 static struct neglist __read_mostly     *neglists;
  237 static struct neglist ncneg_hot;
  238 
  239 #define numneglists (ncneghash + 1)
  240 static u_int __read_mostly      ncneghash;
  241 static inline struct neglist *
  242 NCP2NEGLIST(struct namecache *ncp)
  243 {
  244 
  245         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
  246 }
  247 
  248 #define numbucketlocks (ncbuckethash + 1)
  249 static u_int __read_mostly  ncbuckethash;
  250 static struct rwlock_padalign __read_mostly  *bucketlocks;
  251 #define HASH2BUCKETLOCK(hash) \
  252         ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
  253 
  254 #define numvnodelocks (ncvnodehash + 1)
  255 static u_int __read_mostly  ncvnodehash;
  256 static struct mtx __read_mostly *vnodelocks;
  257 static inline struct mtx *
  258 VP2VNODELOCK(struct vnode *vp)
  259 {
  260 
  261         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
  262 }
  263 
  264 /*
  265  * UMA zones for the VFS cache.
  266  *
  267  * The small cache is used for entries with short names, which are the
  268  * most common.  The large cache is used for entries which are too big to
  269  * fit in the small cache.
  270  */
  271 static uma_zone_t __read_mostly cache_zone_small;
  272 static uma_zone_t __read_mostly cache_zone_small_ts;
  273 static uma_zone_t __read_mostly cache_zone_large;
  274 static uma_zone_t __read_mostly cache_zone_large_ts;
  275 
  276 #define CACHE_PATH_CUTOFF       35
  277 
  278 static struct namecache *
  279 cache_alloc(int len, int ts)
  280 {
  281         struct namecache_ts *ncp_ts;
  282         struct namecache *ncp;
  283 
  284         if (__predict_false(ts)) {
  285                 if (len <= CACHE_PATH_CUTOFF)
  286                         ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK);
  287                 else
  288                         ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK);
  289                 ncp = &ncp_ts->nc_nc;
  290         } else {
  291                 if (len <= CACHE_PATH_CUTOFF)
  292                         ncp = uma_zalloc(cache_zone_small, M_WAITOK);
  293                 else
  294                         ncp = uma_zalloc(cache_zone_large, M_WAITOK);
  295         }
  296         return (ncp);
  297 }
  298 
  299 static void
  300 cache_free(struct namecache *ncp)
  301 {
  302         struct namecache_ts *ncp_ts;
  303 
  304         if (ncp == NULL)
  305                 return;
  306         if ((ncp->nc_flag & NCF_DVDROP) != 0)
  307                 vdrop(ncp->nc_dvp);
  308         if (__predict_false(ncp->nc_flag & NCF_TS)) {
  309                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
  310                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
  311                         uma_zfree(cache_zone_small_ts, ncp_ts);
  312                 else
  313                         uma_zfree(cache_zone_large_ts, ncp_ts);
  314         } else {
  315                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
  316                         uma_zfree(cache_zone_small, ncp);
  317                 else
  318                         uma_zfree(cache_zone_large, ncp);
  319         }
  320 }
  321 
  322 static void
  323 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
  324 {
  325         struct namecache_ts *ncp_ts;
  326 
  327         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
  328             (tsp == NULL && ticksp == NULL),
  329             ("No NCF_TS"));
  330 
  331         if (tsp == NULL && ticksp == NULL)
  332                 return;
  333 
  334         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
  335         if (tsp != NULL)
  336                 *tsp = ncp_ts->nc_time;
  337         if (ticksp != NULL)
  338                 *ticksp = ncp_ts->nc_ticks;
  339 }
  340 
  341 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
  342 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
  343     "VFS namecache enabled");
  344 
  345 /* Export size information to userland */
  346 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
  347     sizeof(struct namecache), "sizeof(struct namecache)");
  348 
  349 /*
  350  * The new name cache statistics
  351  */
  352 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
  353     "Name cache statistics");
  354 #define STATNODE_ULONG(name, descr)     \
  355         SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
  356 #define STATNODE_COUNTER(name, descr)   \
  357         static counter_u64_t __read_mostly name; \
  358         SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr);
  359 STATNODE_ULONG(numneg, "Number of negative cache entries");
  360 STATNODE_ULONG(numcache, "Number of cache entries");
  361 STATNODE_COUNTER(numcalls, "Number of cache lookups");
  362 STATNODE_COUNTER(dothits, "Number of '.' hits");
  363 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
  364 STATNODE_COUNTER(numchecks, "Number of checks in lookup");
  365 STATNODE_COUNTER(nummiss, "Number of cache misses");
  366 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
  367 STATNODE_COUNTER(numposzaps,
  368     "Number of cache hits (positive) we do not want to cache");
  369 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
  370 STATNODE_COUNTER(numnegzaps,
  371     "Number of cache hits (negative) we do not want to cache");
  372 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
  373 /* These count for kern___getcwd(), too. */
  374 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
  375 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
  376 STATNODE_COUNTER(numfullpathfail2,
  377     "Number of fullpath search errors (VOP_VPTOCNP failures)");
  378 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
  379 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
  380 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
  381     "Number of times zap_and_exit failed to lock");
  382 static long cache_lock_vnodes_cel_3_failures;
  383 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
  384     "Number of times 3-way vnode locking failed");
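      /*
       * Usage note (illustrative): the statistics declared above are exported
       * under the vfs.cache sysctl tree, so current values can be read from
       * userland with sysctl(8), e.g.:
       *
       *      $ sysctl vfs.cache.numcalls vfs.cache.nummiss vfs.cache.numneghits
       */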
  385 
  386 static void cache_zap_locked(struct namecache *ncp, bool neg_locked);
  387 static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
  388     char *buf, char **retbuf, u_int buflen);
  389 
  390 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
  391 
  392 static int cache_yield;
  393 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
  394     "Number of times cache called yield");
  395 
  396 static void
  397 cache_maybe_yield(void)
  398 {
  399 
  400         if (should_yield()) {
  401                 cache_yield++;
  402                 kern_yield(PRI_USER);
  403         }
  404 }
  405 
  406 static inline void
  407 cache_assert_vlp_locked(struct mtx *vlp)
  408 {
  409 
  410         if (vlp != NULL)
  411                 mtx_assert(vlp, MA_OWNED);
  412 }
  413 
  414 static inline void
  415 cache_assert_vnode_locked(struct vnode *vp)
  416 {
  417         struct mtx *vlp;
  418 
  419         vlp = VP2VNODELOCK(vp);
  420         cache_assert_vlp_locked(vlp);
  421 }
  422 
  423 static uint32_t
  424 cache_get_hash(char *name, u_char len, struct vnode *dvp)
  425 {
  426         uint32_t hash;
  427 
  428         hash = fnv_32_buf(name, len, FNV1_32_INIT);
  429         hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
  430         return (hash);
  431 }
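      /*
       * Illustrative sketch (not upstream code): lookups and insertions must
       * derive the hash bucket and its lock from the same (name, dvp) pair so
       * that they agree on which chain to search, roughly:
       *
       *      hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
       *      blp = HASH2BUCKETLOCK(hash);
       *      rw_rlock(blp);
       *      LIST_FOREACH(ncp, NCHHASH(hash), nc_hash) {
       *              if (ncp->nc_dvp == dvp &&
       *                  ncp->nc_nlen == cnp->cn_namelen &&
       *                  !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
       *                      break;
       *      }
       *      rw_runlock(blp);
       *
       * which is the shape of the chain walks in cache_lookup() below.
       */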
  432 
  433 static inline struct rwlock *
  434 NCP2BUCKETLOCK(struct namecache *ncp)
  435 {
  436         uint32_t hash;
  437 
  438         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
  439         return (HASH2BUCKETLOCK(hash));
  440 }
  441 
  442 #ifdef INVARIANTS
  443 static void
  444 cache_assert_bucket_locked(struct namecache *ncp, int mode)
  445 {
  446         struct rwlock *blp;
  447 
  448         blp = NCP2BUCKETLOCK(ncp);
  449         rw_assert(blp, mode);
  450 }
  451 #else
  452 #define cache_assert_bucket_locked(x, y) do { } while (0)
  453 #endif
  454 
  455 #define cache_sort(x, y)        _cache_sort((void **)(x), (void **)(y))
  456 static void
  457 _cache_sort(void **p1, void **p2)
  458 {
  459         void *tmp;
  460 
  461         if (*p1 > *p2) {
  462                 tmp = *p2;
  463                 *p2 = *p1;
  464                 *p1 = tmp;
  465         }
  466 }
  467 
  468 static void
  469 cache_lock_all_buckets(void)
  470 {
  471         u_int i;
  472 
  473         for (i = 0; i < numbucketlocks; i++)
  474                 rw_wlock(&bucketlocks[i]);
  475 }
  476 
  477 static void
  478 cache_unlock_all_buckets(void)
  479 {
  480         u_int i;
  481 
  482         for (i = 0; i < numbucketlocks; i++)
  483                 rw_wunlock(&bucketlocks[i]);
  484 }
  485 
  486 static void
  487 cache_lock_all_vnodes(void)
  488 {
  489         u_int i;
  490 
  491         for (i = 0; i < numvnodelocks; i++)
  492                 mtx_lock(&vnodelocks[i]);
  493 }
  494 
  495 static void
  496 cache_unlock_all_vnodes(void)
  497 {
  498         u_int i;
  499 
  500         for (i = 0; i < numvnodelocks; i++)
  501                 mtx_unlock(&vnodelocks[i]);
  502 }
  503 
  504 static int
  505 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
  506 {
  507 
  508         cache_sort(&vlp1, &vlp2);
  509         MPASS(vlp2 != NULL);
  510 
  511         if (vlp1 != NULL) {
  512                 if (!mtx_trylock(vlp1))
  513                         return (EAGAIN);
  514         }
  515         if (!mtx_trylock(vlp2)) {
  516                 if (vlp1 != NULL)
  517                         mtx_unlock(vlp1);
  518                 return (EAGAIN);
  519         }
  520 
  521         return (0);
  522 }
  523 
  524 static void
  525 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
  526 {
  527 
  528         MPASS(vlp1 != NULL || vlp2 != NULL);
  529 
  530         if (vlp1 != NULL)
  531                 mtx_unlock(vlp1);
  532         if (vlp2 != NULL)
  533                 mtx_unlock(vlp2);
  534 }
  535 
  536 static int
  537 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
  538 {
  539         struct nchstats snap;
  540 
  541         if (req->oldptr == NULL)
  542                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
  543 
  544         snap = nchstats;
  545         snap.ncs_goodhits = counter_u64_fetch(numposhits);
  546         snap.ncs_neghits = counter_u64_fetch(numneghits);
  547         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
  548             counter_u64_fetch(numnegzaps);
  549         snap.ncs_miss = counter_u64_fetch(nummisszap) +
  550             counter_u64_fetch(nummiss);
  551 
  552         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
  553 }
  554 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
  555     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
  556     "VFS cache effectiveness statistics");
  557 
  558 #ifdef DIAGNOSTIC
  559 /*
  560  * Grab an atomic snapshot of the name cache hash chain lengths
  561  */
  562 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
  563     "hash table stats");
  564 
  565 static int
  566 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
  567 {
  568         struct nchashhead *ncpp;
  569         struct namecache *ncp;
  570         int i, error, n_nchash, *cntbuf;
  571 
  572 retry:
  573         n_nchash = nchash + 1;  /* nchash is max index, not count */
  574         if (req->oldptr == NULL)
  575                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
  576         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
  577         cache_lock_all_buckets();
  578         if (n_nchash != nchash + 1) {
  579                 cache_unlock_all_buckets();
  580                 free(cntbuf, M_TEMP);
  581                 goto retry;
  582         }
  583         /* Scan hash tables counting entries */
  584         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
  585                 LIST_FOREACH(ncp, ncpp, nc_hash)
  586                         cntbuf[i]++;
  587         cache_unlock_all_buckets();
  588         for (error = 0, i = 0; i < n_nchash; i++)
  589                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
  590                         break;
  591         free(cntbuf, M_TEMP);
  592         return (error);
  593 }
  594 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
  595     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
  596     "nchash chain lengths");
  597 
  598 static int
  599 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
  600 {
  601         int error;
  602         struct nchashhead *ncpp;
  603         struct namecache *ncp;
  604         int n_nchash;
  605         int count, maxlength, used, pct;
  606 
  607         if (!req->oldptr)
  608                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
  609 
  610         cache_lock_all_buckets();
  611         n_nchash = nchash + 1;  /* nchash is max index, not count */
  612         used = 0;
  613         maxlength = 0;
  614 
  615         /* Scan hash tables for applicable entries */
  616         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
  617                 count = 0;
  618                 LIST_FOREACH(ncp, ncpp, nc_hash) {
  619                         count++;
  620                 }
  621                 if (count)
  622                         used++;
  623                 if (maxlength < count)
  624                         maxlength = count;
  625         }
  626         n_nchash = nchash + 1;
  627         cache_unlock_all_buckets();
  628         pct = (used * 100) / (n_nchash / 100);
  629         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
  630         if (error)
  631                 return (error);
  632         error = SYSCTL_OUT(req, &used, sizeof(used));
  633         if (error)
  634                 return (error);
  635         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
  636         if (error)
  637                 return (error);
  638         error = SYSCTL_OUT(req, &pct, sizeof(pct));
  639         if (error)
  640                 return (error);
  641         return (0);
  642 }
  643 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
  644     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
  645     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
  646 #endif
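      /*
       * Usage note (illustrative): the debug.hashstat.rawnchash and
       * debug.hashstat.nchash handlers above are only compiled into kernels
       * built with "options DIAGNOSTIC"; on such a kernel the chain-length
       * summary can be read with sysctl debug.hashstat.nchash.
       */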
  647 
  648 /*
  649  * Negative entries management
  650  *
  651  * A variation of LRU scheme is used. New entries are hashed into one of
  652  * numneglists cold lists. Entries get promoted to the hot list on first hit.
  653  * Partial LRU for the hot list is maintained by requeueing them every
  654  * ncneghitsrequeue hits.
  655  *
  656  * The shrinker will demote hot list head and evict from the cold list in a
  657  * round-robin manner.
  658  */
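      /*
       * Tying the description above to the code below (illustrative summary):
       * cache_negative_hit() promotes an entry from its cold list to ncneg_hot
       * on the first hit and requeues hot entries every ncneghitsrequeue hits,
       * while cache_negative_zap_one() demotes the head of the hot list and
       * evicts from the cold lists in a round-robin fashion.
       */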
  659 static void
  660 cache_negative_hit(struct namecache *ncp)
  661 {
  662         struct neglist *neglist;
  663         u_int hits;
  664 
  665         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  666         hits = atomic_fetchadd_int(&ncp->nc_neghits, 1);
  667         if (ncp->nc_flag & NCF_HOTNEGATIVE) {
  668                 if ((hits % ncneghitsrequeue) != 0)
  669                         return;
  670                 mtx_lock(&ncneg_hot.nl_lock);
  671                 if (ncp->nc_flag & NCF_HOTNEGATIVE) {
  672                         TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
  673                         TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
  674                         mtx_unlock(&ncneg_hot.nl_lock);
  675                         return;
  676                 }
  677                 /*
  678                  * The shrinker cleared the flag and removed the entry from
  679                  * the hot list. Put it back.
  680                  */
  681         } else {
  682                 mtx_lock(&ncneg_hot.nl_lock);
  683         }
  684         neglist = NCP2NEGLIST(ncp);
  685         mtx_lock(&neglist->nl_lock);
  686         if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
  687                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
  688                 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
  689                 ncp->nc_flag |= NCF_HOTNEGATIVE;
  690         }
  691         mtx_unlock(&neglist->nl_lock);
  692         mtx_unlock(&ncneg_hot.nl_lock);
  693 }
  694 
  695 static void
  696 cache_negative_insert(struct namecache *ncp, bool neg_locked)
  697 {
  698         struct neglist *neglist;
  699 
  700         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  701         cache_assert_bucket_locked(ncp, RA_WLOCKED);
  702         neglist = NCP2NEGLIST(ncp);
  703         if (!neg_locked) {
  704                 mtx_lock(&neglist->nl_lock);
  705         } else {
  706                 mtx_assert(&neglist->nl_lock, MA_OWNED);
  707         }
  708         TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
  709         if (!neg_locked)
  710                 mtx_unlock(&neglist->nl_lock);
  711         atomic_add_rel_long(&numneg, 1);
  712 }
  713 
  714 static void
  715 cache_negative_remove(struct namecache *ncp, bool neg_locked)
  716 {
  717         struct neglist *neglist;
  718         bool hot_locked = false;
  719         bool list_locked = false;
  720 
  721         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  722         cache_assert_bucket_locked(ncp, RA_WLOCKED);
  723         neglist = NCP2NEGLIST(ncp);
  724         if (!neg_locked) {
  725                 if (ncp->nc_flag & NCF_HOTNEGATIVE) {
  726                         hot_locked = true;
  727                         mtx_lock(&ncneg_hot.nl_lock);
  728                         if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
  729                                 list_locked = true;
  730                                 mtx_lock(&neglist->nl_lock);
  731                         }
  732                 } else {
  733                         list_locked = true;
  734                         mtx_lock(&neglist->nl_lock);
  735                 }
  736         }
  737         if (ncp->nc_flag & NCF_HOTNEGATIVE) {
  738                 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
  739                 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
  740         } else {
  741                 mtx_assert(&neglist->nl_lock, MA_OWNED);
  742                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
  743         }
  744         if (list_locked)
  745                 mtx_unlock(&neglist->nl_lock);
  746         if (hot_locked)
  747                 mtx_unlock(&ncneg_hot.nl_lock);
  748         atomic_subtract_rel_long(&numneg, 1);
  749 }
  750 
  751 static void
  752 cache_negative_shrink_select(int start, struct namecache **ncpp,
  753     struct neglist **neglistpp)
  754 {
  755         struct neglist *neglist;
  756         struct namecache *ncp;
  757         int i;
  758 
  759         *ncpp = ncp = NULL;
  760         neglist = NULL;
  761 
  762         for (i = start; i < numneglists; i++) {
  763                 neglist = &neglists[i];
  764                 if (TAILQ_FIRST(&neglist->nl_list) == NULL)
  765                         continue;
  766                 mtx_lock(&neglist->nl_lock);
  767                 ncp = TAILQ_FIRST(&neglist->nl_list);
  768                 if (ncp != NULL)
  769                         break;
  770                 mtx_unlock(&neglist->nl_lock);
  771         }
  772 
  773         *neglistpp = neglist;
  774         *ncpp = ncp;
  775 }
  776 
  777 static void
  778 cache_negative_zap_one(void)
  779 {
  780         struct namecache *ncp, *ncp2;
  781         struct neglist *neglist;
  782         struct mtx *dvlp;
  783         struct rwlock *blp;
  784 
  785         if (!mtx_trylock(&ncneg_shrink_lock))
  786                 return;
  787 
  788         mtx_lock(&ncneg_hot.nl_lock);
  789         ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
  790         if (ncp != NULL) {
  791                 neglist = NCP2NEGLIST(ncp);
  792                 mtx_lock(&neglist->nl_lock);
  793                 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
  794                 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
  795                 ncp->nc_flag &= ~NCF_HOTNEGATIVE;
  796                 mtx_unlock(&neglist->nl_lock);
  797         }
  798 
  799         cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
  800         shrink_list_turn++;
  801         if (shrink_list_turn == numneglists)
  802                 shrink_list_turn = 0;
  803         if (ncp == NULL && shrink_list_turn == 0)
  804                 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
  805         if (ncp == NULL) {
  806                 mtx_unlock(&ncneg_hot.nl_lock);
  807                 goto out;
  808         }
  809 
  810         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  811         dvlp = VP2VNODELOCK(ncp->nc_dvp);
  812         blp = NCP2BUCKETLOCK(ncp);
  813         mtx_unlock(&neglist->nl_lock);
  814         mtx_unlock(&ncneg_hot.nl_lock);
  815         mtx_lock(dvlp);
  816         rw_wlock(blp);
  817         mtx_lock(&neglist->nl_lock);
  818         ncp2 = TAILQ_FIRST(&neglist->nl_list);
  819         if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
  820             blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) {
  821                 ncp = NULL;
  822                 goto out_unlock_all;
  823         }
  824         SDT_PROBE3(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
  825             ncp->nc_name, ncp->nc_neghits);
  826 
  827         cache_zap_locked(ncp, true);
  828 out_unlock_all:
  829         mtx_unlock(&neglist->nl_lock);
  830         rw_wunlock(blp);
  831         mtx_unlock(dvlp);
  832 out:
  833         mtx_unlock(&ncneg_shrink_lock);
  834         cache_free(ncp);
  835 }
  836 
  837 /*
  838  * cache_zap_locked():
  839  *
  840  *   Removes a namecache entry from cache, whether it contains an actual
  841  *   pointer to a vnode or if it is just a negative cache entry.
  842  */
  843 static void
  844 cache_zap_locked(struct namecache *ncp, bool neg_locked)
  845 {
  846 
  847         if (!(ncp->nc_flag & NCF_NEGATIVE))
  848                 cache_assert_vnode_locked(ncp->nc_vp);
  849         cache_assert_vnode_locked(ncp->nc_dvp);
  850         cache_assert_bucket_locked(ncp, RA_WLOCKED);
  851 
  852         CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
  853             (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
  854         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
  855                 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
  856                     ncp->nc_name, ncp->nc_vp);
  857         } else {
  858                 SDT_PROBE3(vfs, namecache, zap_negative, done, ncp->nc_dvp,
  859                     ncp->nc_name, ncp->nc_neghits);
  860         }
  861         LIST_REMOVE(ncp, nc_hash);
  862         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
  863                 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
  864                 if (ncp == ncp->nc_vp->v_cache_dd)
  865                         ncp->nc_vp->v_cache_dd = NULL;
  866         } else {
  867                 cache_negative_remove(ncp, neg_locked);
  868         }
  869         if (ncp->nc_flag & NCF_ISDOTDOT) {
  870                 if (ncp == ncp->nc_dvp->v_cache_dd)
  871                         ncp->nc_dvp->v_cache_dd = NULL;
  872         } else {
  873                 LIST_REMOVE(ncp, nc_src);
  874                 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
  875                         ncp->nc_flag |= NCF_DVDROP;
  876                         atomic_subtract_rel_long(&numcachehv, 1);
  877                 }
  878         }
  879         atomic_subtract_rel_long(&numcache, 1);
  880 }
  881 
  882 static void
  883 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
  884 {
  885         struct rwlock *blp;
  886 
  887         MPASS(ncp->nc_dvp == vp);
  888         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  889         cache_assert_vnode_locked(vp);
  890 
  891         blp = NCP2BUCKETLOCK(ncp);
  892         rw_wlock(blp);
  893         cache_zap_locked(ncp, false);
  894         rw_wunlock(blp);
  895 }
  896 
  897 static bool
  898 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
  899     struct mtx **vlpp)
  900 {
  901         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
  902         struct rwlock *blp;
  903 
  904         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
  905         cache_assert_vnode_locked(vp);
  906 
  907         if (ncp->nc_flag & NCF_NEGATIVE) {
  908                 if (*vlpp != NULL) {
  909                         mtx_unlock(*vlpp);
  910                         *vlpp = NULL;
  911                 }
  912                 cache_zap_negative_locked_vnode_kl(ncp, vp);
  913                 return (true);
  914         }
  915 
  916         pvlp = VP2VNODELOCK(vp);
  917         blp = NCP2BUCKETLOCK(ncp);
  918         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
  919         vlp2 = VP2VNODELOCK(ncp->nc_vp);
  920 
  921         if (*vlpp == vlp1 || *vlpp == vlp2) {
  922                 to_unlock = *vlpp;
  923                 *vlpp = NULL;
  924         } else {
  925                 if (*vlpp != NULL) {
  926                         mtx_unlock(*vlpp);
  927                         *vlpp = NULL;
  928                 }
  929                 cache_sort(&vlp1, &vlp2);
  930                 if (vlp1 == pvlp) {
  931                         mtx_lock(vlp2);
  932                         to_unlock = vlp2;
  933                 } else {
  934                         if (!mtx_trylock(vlp1))
  935                                 goto out_relock;
  936                         to_unlock = vlp1;
  937                 }
  938         }
  939         rw_wlock(blp);
  940         cache_zap_locked(ncp, false);
  941         rw_wunlock(blp);
  942         if (to_unlock != NULL)
  943                 mtx_unlock(to_unlock);
  944         return (true);
  945 
  946 out_relock:
  947         mtx_unlock(vlp2);
  948         mtx_lock(vlp1);
  949         mtx_lock(vlp2);
  950         MPASS(*vlpp == NULL);
  951         *vlpp = vlp1;
  952         return (false);
  953 }
  954 
  955 static int
  956 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
  957 {
  958         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
  959         struct rwlock *blp;
  960         int error = 0;
  961 
  962         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
  963         cache_assert_vnode_locked(vp);
  964 
  965         pvlp = VP2VNODELOCK(vp);
  966         if (ncp->nc_flag & NCF_NEGATIVE) {
  967                 cache_zap_negative_locked_vnode_kl(ncp, vp);
  968                 goto out;
  969         }
  970 
  971         blp = NCP2BUCKETLOCK(ncp);
  972         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
  973         vlp2 = VP2VNODELOCK(ncp->nc_vp);
  974         cache_sort(&vlp1, &vlp2);
  975         if (vlp1 == pvlp) {
  976                 mtx_lock(vlp2);
  977                 to_unlock = vlp2;
  978         } else {
  979                 if (!mtx_trylock(vlp1)) {
  980                         error = EAGAIN;
  981                         goto out;
  982                 }
  983                 to_unlock = vlp1;
  984         }
  985         rw_wlock(blp);
  986         cache_zap_locked(ncp, false);
  987         rw_wunlock(blp);
  988         mtx_unlock(to_unlock);
  989 out:
  990         mtx_unlock(pvlp);
  991         return (error);
  992 }
  993 
  994 static int
  995 cache_zap_wlocked_bucket(struct namecache *ncp, struct rwlock *blp)
  996 {
  997         struct mtx *dvlp, *vlp;
  998 
  999         cache_assert_bucket_locked(ncp, RA_WLOCKED);
 1000 
 1001         dvlp = VP2VNODELOCK(ncp->nc_dvp);
 1002         vlp = NULL;
 1003         if (!(ncp->nc_flag & NCF_NEGATIVE))
 1004                 vlp = VP2VNODELOCK(ncp->nc_vp);
 1005         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
 1006                 cache_zap_locked(ncp, false);
 1007                 rw_wunlock(blp);
 1008                 cache_unlock_vnodes(dvlp, vlp);
 1009                 return (0);
 1010         }
 1011 
 1012         rw_wunlock(blp);
 1013         return (EAGAIN);
 1014 }
 1015 
 1016 static int
 1017 cache_zap_rlocked_bucket(struct namecache *ncp, struct rwlock *blp)
 1018 {
 1019         struct mtx *dvlp, *vlp;
 1020 
 1021         cache_assert_bucket_locked(ncp, RA_RLOCKED);
 1022 
 1023         dvlp = VP2VNODELOCK(ncp->nc_dvp);
 1024         vlp = NULL;
 1025         if (!(ncp->nc_flag & NCF_NEGATIVE))
 1026                 vlp = VP2VNODELOCK(ncp->nc_vp);
 1027         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
 1028                 rw_runlock(blp);
 1029                 rw_wlock(blp);
 1030                 cache_zap_locked(ncp, false);
 1031                 rw_wunlock(blp);
 1032                 cache_unlock_vnodes(dvlp, vlp);
 1033                 return (0);
 1034         }
 1035 
 1036         rw_runlock(blp);
 1037         return (EAGAIN);
 1038 }
 1039 
 1040 static int
 1041 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
 1042     struct mtx **vlpp1, struct mtx **vlpp2)
 1043 {
 1044         struct mtx *dvlp, *vlp;
 1045 
 1046         cache_assert_bucket_locked(ncp, RA_WLOCKED);
 1047 
 1048         dvlp = VP2VNODELOCK(ncp->nc_dvp);
 1049         vlp = NULL;
 1050         if (!(ncp->nc_flag & NCF_NEGATIVE))
 1051                 vlp = VP2VNODELOCK(ncp->nc_vp);
 1052         cache_sort(&dvlp, &vlp);
 1053 
 1054         if (*vlpp1 == dvlp && *vlpp2 == vlp) {
 1055                 cache_zap_locked(ncp, false);
 1056                 cache_unlock_vnodes(dvlp, vlp);
 1057                 *vlpp1 = NULL;
 1058                 *vlpp2 = NULL;
 1059                 return (0);
 1060         }
 1061 
 1062         if (*vlpp1 != NULL)
 1063                 mtx_unlock(*vlpp1);
 1064         if (*vlpp2 != NULL)
 1065                 mtx_unlock(*vlpp2);
 1066         *vlpp1 = NULL;
 1067         *vlpp2 = NULL;
 1068 
 1069         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
 1070                 cache_zap_locked(ncp, false);
 1071                 cache_unlock_vnodes(dvlp, vlp);
 1072                 return (0);
 1073         }
 1074 
 1075         rw_wunlock(blp);
 1076         *vlpp1 = dvlp;
 1077         *vlpp2 = vlp;
 1078         if (*vlpp1 != NULL)
 1079                 mtx_lock(*vlpp1);
 1080         mtx_lock(*vlpp2);
 1081         rw_wlock(blp);
 1082         return (EAGAIN);
 1083 }
 1084 
 1085 static void
 1086 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
 1087 {
 1088 
 1089         if (blp != NULL) {
 1090                 rw_runlock(blp);
 1091         } else {
 1092                 mtx_unlock(vlp);
 1093         }
 1094 }
 1095 
 1096 static int __noinline
 1097 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
 1098     struct timespec *tsp, int *ticksp)
 1099 {
 1100         int ltype;
 1101 
 1102         *vpp = dvp;
 1103         CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
 1104                         dvp, cnp->cn_nameptr);
 1105         counter_u64_add(dothits, 1);
 1106         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
 1107         if (tsp != NULL)
 1108                 timespecclear(tsp);
 1109         if (ticksp != NULL)
 1110                 *ticksp = ticks;
 1111         vrefact(*vpp);
 1112         /*
 1113          * When we lookup "." we still can be asked to lock it
 1114          * differently...
 1115          */
 1116         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
 1117         if (ltype != VOP_ISLOCKED(*vpp)) {
 1118                 if (ltype == LK_EXCLUSIVE) {
 1119                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
 1120                         if ((*vpp)->v_iflag & VI_DOOMED) {
 1121                                 /* forced unmount */
 1122                                 vrele(*vpp);
 1123                                 *vpp = NULL;
 1124                                 return (ENOENT);
 1125                         }
 1126                 } else
 1127                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
 1128         }
 1129         return (-1);
 1130 }
 1131 
 1132 /*
 1133  * Lookup an entry in the cache
 1134  *
 1135  * Lookup is called with dvp pointing to the directory to search,
 1136  * cnp pointing to the name of the entry being sought. If the lookup
 1137  * succeeds, the vnode is returned in *vpp, and a status of -1 is
 1138  * returned. If the lookup determines that the name does not exist
 1139  * (negative caching), a status of ENOENT is returned. If the lookup
 1140  * fails, a status of zero is returned.  If the directory vnode is
 1141  * recycled out from under us due to a forced unmount, a status of
 1142  * ENOENT is returned.
 1143  *
 1144  * vpp is locked and ref'd on return.  If we're looking up DOTDOT, dvp is
 1145  * unlocked.  If we're looking up ".", an extra ref is taken, but the lock is
 1146  * not recursively acquired.
 1147  */
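      /*
       * Hypothetical caller-side sketch of the protocol described above
       * (illustrative names; loosely modeled on how a filesystem's VOP_LOOKUP
       * wrapper typically consumes cache_lookup()):
       *
       *      error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
       *      if (error == 0)         // miss: go read the directory
       *              return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
       *      if (error == -1)        // hit: *vpp is referenced and locked
       *              return (0);
       *      return (error);         // ENOENT: cached negative entry
       */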
 1148 
 1149 static __noinline int
 1150 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
 1151     struct componentname *cnp, struct timespec *tsp, int *ticksp)
 1152 {
 1153         struct namecache *ncp;
 1154         struct rwlock *blp;
 1155         struct mtx *dvlp, *dvlp2;
 1156         uint32_t hash;
 1157         int error;
 1158 
 1159         if (cnp->cn_namelen == 2 &&
 1160             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
 1161                 counter_u64_add(dotdothits, 1);
 1162                 dvlp = VP2VNODELOCK(dvp);
 1163                 dvlp2 = NULL;
 1164                 mtx_lock(dvlp);
 1165 retry_dotdot:
 1166                 ncp = dvp->v_cache_dd;
 1167                 if (ncp == NULL) {
 1168                         SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
 1169                             "..", NULL);
 1170                         mtx_unlock(dvlp);
 1171                         if (dvlp2 != NULL)
 1172                                 mtx_unlock(dvlp2);
 1173                         return (0);
 1174                 }
 1175                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
 1176                         if (ncp->nc_dvp != dvp)
 1177                                 panic("dvp %p v_cache_dd %p\n", dvp, ncp);
 1178                         if (!cache_zap_locked_vnode_kl2(ncp,
 1179                             dvp, &dvlp2))
 1180                                 goto retry_dotdot;
 1181                         MPASS(dvp->v_cache_dd == NULL);
 1182                         mtx_unlock(dvlp);
 1183                         if (dvlp2 != NULL)
 1184                                 mtx_unlock(dvlp2);
 1185                         cache_free(ncp);
 1186                 } else {
 1187                         dvp->v_cache_dd = NULL;
 1188                         mtx_unlock(dvlp);
 1189                         if (dvlp2 != NULL)
 1190                                 mtx_unlock(dvlp2);
 1191                 }
 1192                 return (0);
 1193         }
 1194 
 1195         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 1196         blp = HASH2BUCKETLOCK(hash);
 1197 retry:
 1198         if (LIST_EMPTY(NCHHASH(hash)))
 1199                 goto out_no_entry;
 1200 
 1201         rw_wlock(blp);
 1202 
 1203         LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 1204                 counter_u64_add(numchecks, 1);
 1205                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 1206                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 1207                         break;
 1208         }
 1209 
 1210         /* We failed to find an entry */
 1211         if (ncp == NULL) {
 1212                 rw_wunlock(blp);
 1213                 goto out_no_entry;
 1214         }
 1215 
 1216         counter_u64_add(numposzaps, 1);
 1217 
 1218         error = cache_zap_wlocked_bucket(ncp, blp);
 1219         if (error != 0) {
 1220                 zap_and_exit_bucket_fail++;
 1221                 cache_maybe_yield();
 1222                 goto retry;
 1223         }
 1224         cache_free(ncp);
 1225         return (0);
 1226 out_no_entry:
 1227         SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
 1228         counter_u64_add(nummisszap, 1);
 1229         return (0);
 1230 }
 1231 
 1232 int
 1233 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
 1234     struct timespec *tsp, int *ticksp)
 1235 {
 1236         struct namecache_ts *ncp_ts;
 1237         struct namecache *ncp;
 1238         struct rwlock *blp;
 1239         struct mtx *dvlp;
 1240         uint32_t hash;
 1241         int error, ltype;
 1242 
 1243         if (__predict_false(!doingcache)) {
 1244                 cnp->cn_flags &= ~MAKEENTRY;
 1245                 return (0);
 1246         }
 1247 
 1248         counter_u64_add(numcalls, 1);
 1249 
 1250         if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
 1251                 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
 1252 
 1253         if ((cnp->cn_flags & MAKEENTRY) == 0)
 1254                 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));
 1255 
 1256 retry:
 1257         blp = NULL;
 1258         dvlp = NULL;
 1259         error = 0;
 1260         if (cnp->cn_namelen == 2 &&
 1261             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
 1262                 counter_u64_add(dotdothits, 1);
 1263                 dvlp = VP2VNODELOCK(dvp);
 1264                 mtx_lock(dvlp);
 1265                 ncp = dvp->v_cache_dd;
 1266                 if (ncp == NULL) {
 1267                         SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
 1268                             "..", NULL);
 1269                         mtx_unlock(dvlp);
 1270                         return (0);
 1271                 }
 1272                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
 1273                         if (ncp->nc_flag & NCF_NEGATIVE)
 1274                                 *vpp = NULL;
 1275                         else
 1276                                 *vpp = ncp->nc_vp;
 1277                 } else
 1278                         *vpp = ncp->nc_dvp;
 1279                 /* Return failure if negative entry was found. */
 1280                 if (*vpp == NULL)
 1281                         goto negative_success;
 1282                 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
 1283                     dvp, cnp->cn_nameptr, *vpp);
 1284                 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
 1285                     *vpp);
 1286                 cache_out_ts(ncp, tsp, ticksp);
 1287                 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
 1288                     NCF_DTS && tsp != NULL) {
 1289                         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 1290                         *tsp = ncp_ts->nc_dotdottime;
 1291                 }
 1292                 goto success;
 1293         }
 1294 
 1295         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 1296         blp = HASH2BUCKETLOCK(hash);
 1297         rw_rlock(blp);
 1298 
 1299         LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 1300                 counter_u64_add(numchecks, 1);
 1301                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 1302                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 1303                         break;
 1304         }
 1305 
 1306         /* We failed to find an entry */
 1307         if (ncp == NULL) {
 1308                 rw_runlock(blp);
 1309                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
 1310                     NULL);
 1311                 counter_u64_add(nummiss, 1);
 1312                 return (0);
 1313         }
 1314 
 1315         /* We found a "positive" match, return the vnode */
 1316         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 1317                 counter_u64_add(numposhits, 1);
 1318                 *vpp = ncp->nc_vp;
 1319                 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
 1320                     dvp, cnp->cn_nameptr, *vpp, ncp);
 1321                 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
 1322                     *vpp);
 1323                 cache_out_ts(ncp, tsp, ticksp);
 1324                 goto success;
 1325         }
 1326 
 1327 negative_success:
 1328         /* We found a negative match, and want to create it, so purge */
 1329         if (cnp->cn_nameiop == CREATE) {
 1330                 counter_u64_add(numnegzaps, 1);
 1331                 goto zap_and_exit;
 1332         }
 1333 
 1334         counter_u64_add(numneghits, 1);
 1335         cache_negative_hit(ncp);
 1336         if (ncp->nc_flag & NCF_WHITE)
 1337                 cnp->cn_flags |= ISWHITEOUT;
 1338         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
 1339             ncp->nc_name);
 1340         cache_out_ts(ncp, tsp, ticksp);
 1341         cache_lookup_unlock(blp, dvlp);
 1342         return (ENOENT);
 1343 
 1344 success:
 1345         /*
 1346          * On success we return a locked and ref'd vnode as per the lookup
 1347          * protocol.
 1348          */
 1349         MPASS(dvp != *vpp);
 1350         ltype = 0;      /* silence gcc warning */
 1351         if (cnp->cn_flags & ISDOTDOT) {
 1352                 ltype = VOP_ISLOCKED(dvp);
 1353                 VOP_UNLOCK(dvp, 0);
 1354         }
 1355         vhold(*vpp);
 1356         cache_lookup_unlock(blp, dvlp);
 1357         error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
 1358         if (cnp->cn_flags & ISDOTDOT) {
 1359                 vn_lock(dvp, ltype | LK_RETRY);
 1360                 if (dvp->v_iflag & VI_DOOMED) {
 1361                         if (error == 0)
 1362                                 vput(*vpp);
 1363                         *vpp = NULL;
 1364                         return (ENOENT);
 1365                 }
 1366         }
 1367         if (error) {
 1368                 *vpp = NULL;
 1369                 goto retry;
 1370         }
 1371         if ((cnp->cn_flags & ISLASTCN) &&
 1372             (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
 1373                 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
 1374         }
 1375         return (-1);
 1376 
 1377 zap_and_exit:
 1378         if (blp != NULL)
 1379                 error = cache_zap_rlocked_bucket(ncp, blp);
 1380         else
 1381                 error = cache_zap_locked_vnode(ncp, dvp);
 1382         if (error != 0) {
 1383                 zap_and_exit_bucket_fail++;
 1384                 cache_maybe_yield();
 1385                 goto retry;
 1386         }
 1387         cache_free(ncp);
 1388         return (0);
 1389 }
 1390 
 1391 struct celockstate {
 1392         struct mtx *vlp[3];
 1393         struct rwlock *blp[2];
 1394 };
 1395 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
 1396 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
 1397 
 1398 static inline void
 1399 cache_celockstate_init(struct celockstate *cel)
 1400 {
 1401 
 1402         bzero(cel, sizeof(*cel));
 1403 }
 1404 
 1405 static void
 1406 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
 1407     struct vnode *dvp)
 1408 {
 1409         struct mtx *vlp1, *vlp2;
 1410 
 1411         MPASS(cel->vlp[0] == NULL);
 1412         MPASS(cel->vlp[1] == NULL);
 1413         MPASS(cel->vlp[2] == NULL);
 1414 
 1415         MPASS(vp != NULL || dvp != NULL);
 1416 
 1417         vlp1 = VP2VNODELOCK(vp);
 1418         vlp2 = VP2VNODELOCK(dvp);
 1419         cache_sort(&vlp1, &vlp2);
 1420 
 1421         if (vlp1 != NULL) {
 1422                 mtx_lock(vlp1);
 1423                 cel->vlp[0] = vlp1;
 1424         }
 1425         mtx_lock(vlp2);
 1426         cel->vlp[1] = vlp2;
 1427 }
 1428 
 1429 static void
 1430 cache_unlock_vnodes_cel(struct celockstate *cel)
 1431 {
 1432 
 1433         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
 1434 
 1435         if (cel->vlp[0] != NULL)
 1436                 mtx_unlock(cel->vlp[0]);
 1437         if (cel->vlp[1] != NULL)
 1438                 mtx_unlock(cel->vlp[1]);
 1439         if (cel->vlp[2] != NULL)
 1440                 mtx_unlock(cel->vlp[2]);
 1441 }
 1442 
 1443 static bool
 1444 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
 1445 {
 1446         struct mtx *vlp;
 1447         bool ret;
 1448 
 1449         cache_assert_vlp_locked(cel->vlp[0]);
 1450         cache_assert_vlp_locked(cel->vlp[1]);
 1451         MPASS(cel->vlp[2] == NULL);
 1452 
 1453         MPASS(vp != NULL);
 1454         vlp = VP2VNODELOCK(vp);
 1455 
 1456         ret = true;
 1457         if (vlp >= cel->vlp[1]) {
 1458                 mtx_lock(vlp);
 1459         } else {
 1460                 if (mtx_trylock(vlp))
 1461                         goto out;
 1462                 cache_lock_vnodes_cel_3_failures++;
 1463                 cache_unlock_vnodes_cel(cel);
 1464                 if (vlp < cel->vlp[0]) {
 1465                         mtx_lock(vlp);
 1466                         mtx_lock(cel->vlp[0]);
 1467                         mtx_lock(cel->vlp[1]);
 1468                 } else {
 1469                         if (cel->vlp[0] != NULL)
 1470                                 mtx_lock(cel->vlp[0]);
 1471                         mtx_lock(vlp);
 1472                         mtx_lock(cel->vlp[1]);
 1473                 }
 1474                 ret = false;
 1475         }
 1476 out:
 1477         cel->vlp[2] = vlp;
 1478         return (ret);
 1479 }
 1480 
 1481 static void
 1482 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
 1483     struct rwlock *blp2)
 1484 {
 1485 
 1486         MPASS(cel->blp[0] == NULL);
 1487         MPASS(cel->blp[1] == NULL);
 1488 
 1489         cache_sort(&blp1, &blp2);
 1490 
 1491         if (blp1 != NULL) {
 1492                 rw_wlock(blp1);
 1493                 cel->blp[0] = blp1;
 1494         }
 1495         rw_wlock(blp2);
 1496         cel->blp[1] = blp2;
 1497 }
 1498 
 1499 static void
 1500 cache_unlock_buckets_cel(struct celockstate *cel)
 1501 {
 1502 
 1503         if (cel->blp[0] != NULL)
 1504                 rw_wunlock(cel->blp[0]);
 1505         rw_wunlock(cel->blp[1]);
 1506 }
 1507 
 1508 /*
 1509  * Lock part of the cache affected by the insertion.
 1510  *
 1511  * This means vnodelocks for dvp and vp, and the relevant bucketlock.
 1512  * However, an insertion can result in removal of an old entry. In that
 1513  * case we have an additional vnode and bucketlock pair to lock. If the old
 1514  * entry is negative, the negative list lock is taken instead of a vnode lock.
 1515  *
 1516  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 1517  * preserving the locking order (smaller address first).
 1518  */
 1519 static void
 1520 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
 1521     uint32_t hash)
 1522 {
 1523         struct namecache *ncp;
 1524         struct rwlock *blps[2];
 1525 
 1526         blps[0] = HASH2BUCKETLOCK(hash);
 1527         for (;;) {
 1528                 blps[1] = NULL;
 1529                 cache_lock_vnodes_cel(cel, dvp, vp);
 1530                 if (vp == NULL || vp->v_type != VDIR)
 1531                         break;
 1532                 ncp = vp->v_cache_dd;
 1533                 if (ncp == NULL)
 1534                         break;
 1535                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 1536                         break;
 1537                 MPASS(ncp->nc_dvp == vp);
 1538                 blps[1] = NCP2BUCKETLOCK(ncp);
 1539                 if (ncp->nc_flag & NCF_NEGATIVE)
 1540                         break;
 1541                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
 1542                         break;
 1543                 /*
 1544                  * All vnodes got re-locked. Re-validate the state and if
 1545                  * nothing changed we are done. Otherwise restart.
 1546                  */
 1547                 if (ncp == vp->v_cache_dd &&
 1548                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
 1549                     blps[1] == NCP2BUCKETLOCK(ncp) &&
 1550                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
 1551                         break;
 1552                 cache_unlock_vnodes_cel(cel);
 1553                 cel->vlp[0] = NULL;
 1554                 cel->vlp[1] = NULL;
 1555                 cel->vlp[2] = NULL;
 1556         }
 1557         cache_lock_buckets_cel(cel, blps[0], blps[1]);
 1558 }
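
/*
 * Editorial sketch, not part of the original file: the address-ordered
 * locking discipline relied upon above.  Of any two locks, the one with the
 * lower address is always taken first, so concurrent acquirers of the same
 * pair cannot deadlock.  "lk1", "lk2" and "tmp" are hypothetical lock
 * pointers; one of the pair may be NULL when only a single vnode is
 * involved, mirroring cache_lock_vnodes_cel().
 *
 *	if (lk1 > lk2) {
 *		tmp = lk1;		sort: smaller address goes first
 *		lk1 = lk2;
 *		lk2 = tmp;
 *	}
 *	if (lk1 != NULL)
 *		mtx_lock(lk1);
 *	mtx_lock(lk2);
 */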
 1559 
 1560 static void
 1561 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
 1562     uint32_t hash)
 1563 {
 1564         struct namecache *ncp;
 1565         struct rwlock *blps[2];
 1566 
 1567         blps[0] = HASH2BUCKETLOCK(hash);
 1568         for (;;) {
 1569                 blps[1] = NULL;
 1570                 cache_lock_vnodes_cel(cel, dvp, vp);
 1571                 ncp = dvp->v_cache_dd;
 1572                 if (ncp == NULL)
 1573                         break;
 1574                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 1575                         break;
 1576                 MPASS(ncp->nc_dvp == dvp);
 1577                 blps[1] = NCP2BUCKETLOCK(ncp);
 1578                 if (ncp->nc_flag & NCF_NEGATIVE)
 1579                         break;
 1580                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
 1581                         break;
 1582                 if (ncp == dvp->v_cache_dd &&
 1583                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
 1584                     blps[1] == NCP2BUCKETLOCK(ncp) &&
 1585                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
 1586                         break;
 1587                 cache_unlock_vnodes_cel(cel);
 1588                 cel->vlp[0] = NULL;
 1589                 cel->vlp[1] = NULL;
 1590                 cel->vlp[2] = NULL;
 1591         }
 1592         cache_lock_buckets_cel(cel, blps[0], blps[1]);
 1593 }
 1594 
 1595 static void
 1596 cache_enter_unlock(struct celockstate *cel)
 1597 {
 1598 
 1599         cache_unlock_buckets_cel(cel);
 1600         cache_unlock_vnodes_cel(cel);
 1601 }
 1602 
 1603 /*
 1604  * Add an entry to the cache.
 1605  */
 1606 void
 1607 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
 1608     struct timespec *tsp, struct timespec *dtsp)
 1609 {
 1610         struct celockstate cel;
 1611         struct namecache *ncp, *n2, *ndd;
 1612         struct namecache_ts *ncp_ts, *n2_ts;
 1613         struct nchashhead *ncpp;
 1614         struct neglist *neglist;
 1615         uint32_t hash;
 1616         int flag;
 1617         int len;
 1618         bool neg_locked;
 1619         int lnumcache;
 1620 
 1621         CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
 1622         VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
 1623             ("cache_enter: Adding a doomed vnode"));
 1624         VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
 1625             ("cache_enter: Doomed vnode used as src"));
 1626 
 1627         if (__predict_false(!doingcache))
 1628                 return;
 1629 
 1630         /*
 1631          * Avoid a blowout in the number of namecache entries.
 1632          */
 1633         if (__predict_false(numcache >= desiredvnodes * ncsizefactor))
 1634                 return;
 1635 
 1636         cache_celockstate_init(&cel);
 1637         ndd = NULL;
 1638         ncp_ts = NULL;
 1639         flag = 0;
 1640         if (cnp->cn_nameptr[0] == '.') {
 1641                 if (cnp->cn_namelen == 1)
 1642                         return;
 1643                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 1644                         len = cnp->cn_namelen;
 1645                         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 1646                         cache_enter_lock_dd(&cel, dvp, vp, hash);
 1647                         /*
 1648                          * If the dotdot entry already exists, just retarget it
 1649                          * to the new parent vnode, otherwise continue with
 1650                          * allocation of a new namecache entry.
 1651                          */
 1652                         if ((ncp = dvp->v_cache_dd) != NULL &&
 1653                             ncp->nc_flag & NCF_ISDOTDOT) {
 1654                                 KASSERT(ncp->nc_dvp == dvp,
 1655                                     ("wrong isdotdot parent"));
 1656                                 neg_locked = false;
 1657                                 if (ncp->nc_flag & NCF_NEGATIVE || vp == NULL) {
 1658                                         neglist = NCP2NEGLIST(ncp);
 1659                                         mtx_lock(&ncneg_hot.nl_lock);
 1660                                         mtx_lock(&neglist->nl_lock);
 1661                                         neg_locked = true;
 1662                                 }
 1663                                 if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 1664                                         TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
 1665                                             ncp, nc_dst);
 1666                                 } else {
 1667                                         cache_negative_remove(ncp, true);
 1668                                 }
 1669                                 if (vp != NULL) {
 1670                                         TAILQ_INSERT_HEAD(&vp->v_cache_dst,
 1671                                             ncp, nc_dst);
 1672                                         ncp->nc_flag &= ~(NCF_NEGATIVE|NCF_HOTNEGATIVE);
 1673                                 } else {
 1674                                         ncp->nc_flag &= ~(NCF_HOTNEGATIVE);
 1675                                         ncp->nc_flag |= NCF_NEGATIVE;
 1676                                         cache_negative_insert(ncp, true);
 1677                                 }
 1678                                 if (neg_locked) {
 1679                                         mtx_unlock(&neglist->nl_lock);
 1680                                         mtx_unlock(&ncneg_hot.nl_lock);
 1681                                 }
 1682                                 ncp->nc_vp = vp;
 1683                                 cache_enter_unlock(&cel);
 1684                                 return;
 1685                         }
 1686                         dvp->v_cache_dd = NULL;
 1687                         cache_enter_unlock(&cel);
 1688                         cache_celockstate_init(&cel);
 1689                         SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp);
 1690                         flag = NCF_ISDOTDOT;
 1691                 }
 1692         }
 1693 
 1694         /*
 1695          * Calculate the hash key and set up as much of the new
 1696          * namecache entry as possible before acquiring the lock.
 1697          */
 1698         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
 1699         ncp->nc_flag = flag;
 1700         ncp->nc_vp = vp;
 1701         if (vp == NULL)
 1702                 ncp->nc_flag |= NCF_NEGATIVE;
 1703         ncp->nc_dvp = dvp;
 1704         if (tsp != NULL) {
 1705                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 1706                 ncp_ts->nc_time = *tsp;
 1707                 ncp_ts->nc_ticks = ticks;
 1708                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
 1709                 if (dtsp != NULL) {
 1710                         ncp_ts->nc_dotdottime = *dtsp;
 1711                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
 1712                 }
 1713         }
 1714         len = ncp->nc_nlen = cnp->cn_namelen;
 1715         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 1716         strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
 1717         cache_enter_lock(&cel, dvp, vp, hash);
 1718 
 1719         /*
 1720          * See if this vnode or negative entry is already in the cache
 1721          * with this name.  This can happen with concurrent lookups of
 1722          * the same path name.
 1723          */
 1724         ncpp = NCHHASH(hash);
 1725         LIST_FOREACH(n2, ncpp, nc_hash) {
 1726                 if (n2->nc_dvp == dvp &&
 1727                     n2->nc_nlen == cnp->cn_namelen &&
 1728                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
 1729                         if (tsp != NULL) {
 1730                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
 1731                                     ("no NCF_TS"));
 1732                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
 1733                                 n2_ts->nc_time = ncp_ts->nc_time;
 1734                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
 1735                                 if (dtsp != NULL) {
 1736                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
 1737                                         if (ncp->nc_flag & NCF_NEGATIVE)
 1738                                                 mtx_lock(&ncneg_hot.nl_lock);
 1739                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
 1740                                         if (ncp->nc_flag & NCF_NEGATIVE)
 1741                                                 mtx_unlock(&ncneg_hot.nl_lock);
 1742                                 }
 1743                         }
 1744                         goto out_unlock_free;
 1745                 }
 1746         }
 1747 
 1748         if (flag == NCF_ISDOTDOT) {
 1749                 /*
 1750                  * See if we are trying to add a ".." entry, but some other
 1751                  * lookup has already populated the v_cache_dd pointer.
 1752                  */
 1753                 if (dvp->v_cache_dd != NULL)
 1754                         goto out_unlock_free;
 1755                 KASSERT(vp == NULL || vp->v_type == VDIR,
 1756                     ("wrong vnode type %p", vp));
 1757                 dvp->v_cache_dd = ncp;
 1758         }
 1759 
 1760         if (vp != NULL) {
 1761                 if (vp->v_type == VDIR) {
 1762                         if (flag != NCF_ISDOTDOT) {
 1763                                 /*
 1764                          * In this case, the cache entry maps both the
 1765                          * name of the directory and the name ".." for the
 1766                          * directory's parent.
 1767                                  */
 1768                                 if ((ndd = vp->v_cache_dd) != NULL) {
 1769                                         if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
 1770                                                 cache_zap_locked(ndd, false);
 1771                                         else
 1772                                                 ndd = NULL;
 1773                                 }
 1774                                 vp->v_cache_dd = ncp;
 1775                         }
 1776                 } else {
 1777                         vp->v_cache_dd = NULL;
 1778                 }
 1779         }
 1780 
 1781         if (flag != NCF_ISDOTDOT) {
 1782                 if (LIST_EMPTY(&dvp->v_cache_src)) {
 1783                         vhold(dvp);
 1784                         atomic_add_rel_long(&numcachehv, 1);
 1785                 }
 1786                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 1787         }
 1788 
 1789         /*
 1790          * Insert the new namecache entry into the appropriate chain
 1791          * within the cache entries table.
 1792          */
 1793         LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 1794 
 1795         /*
 1796          * If the entry is "negative", we place it into the
 1797          * "negative" cache queue, otherwise, we place it into the
 1798          * destination vnode's cache entries queue.
 1799          */
 1800         if (vp != NULL) {
 1801                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
 1802                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
 1803                     vp);
 1804         } else {
 1805                 if (cnp->cn_flags & ISWHITEOUT)
 1806                         ncp->nc_flag |= NCF_WHITE;
 1807                 cache_negative_insert(ncp, false);
 1808                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
 1809                     ncp->nc_name);
 1810         }
 1811         cache_enter_unlock(&cel);
 1812         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
 1813         if (numneg * ncnegfactor > lnumcache)
 1814                 cache_negative_zap_one();
 1815         cache_free(ndd);
 1816         return;
 1817 out_unlock_free:
 1818         cache_enter_unlock(&cel);
 1819         cache_free(ncp);
 1820         return;
 1821 }
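
/*
 * Editorial sketch, not part of the original file: the usual way a
 * filesystem lookup routine feeds this function, via the cache_enter()
 * wrapper (cache_enter_time() with NULL timestamps; compare the ABI shim
 * near the end of this file).  The surrounding lookup logic and the
 * MAKEENTRY/CREATE checks are sketched from a typical caller and are not
 * taken from any particular filesystem.
 *
 *	if (cnp->cn_flags & MAKEENTRY)
 *		cache_enter(dvp, *vpp, cnp);		positive entry
 *
 *	if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE)
 *		cache_enter(dvp, NULL, cnp);		negative entry
 */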
 1822 
 1823 static u_int
 1824 cache_roundup_2(u_int val)
 1825 {
 1826         u_int res;
 1827 
 1828         for (res = 1; res <= val; res <<= 1)
 1829                 continue;
 1830 
 1831         return (res);
 1832 }
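
/*
 * Editorial note, not part of the original file: cache_roundup_2() returns
 * the smallest power of two strictly greater than val, and the callers in
 * nchinit() below subtract one from the result to obtain an all-ones hash
 * mask.  For example:
 *
 *	cache_roundup_2(64)  == 128
 *	cache_roundup_2(256) == 512
 *	ncbuckethash = cache_roundup_2(mp_ncpus * 64) - 1;
 *		with mp_ncpus == 4: 4 * 64 = 256 -> 512, mask 0x1ff
 */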
 1833 
 1834 /*
 1835  * Name cache initialization, from vfs_init() when we are booting
 1836  */
 1837 static void
 1838 nchinit(void *dummy __unused)
 1839 {
 1840         u_int i;
 1841 
 1842         cache_zone_small = uma_zcreate("S VFS Cache",
 1843             sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
 1844             NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
 1845             UMA_ZONE_ZINIT);
 1846         cache_zone_small_ts = uma_zcreate("STS VFS Cache",
 1847             sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
 1848             NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
 1849             UMA_ZONE_ZINIT);
 1850         cache_zone_large = uma_zcreate("L VFS Cache",
 1851             sizeof(struct namecache) + NAME_MAX + 1,
 1852             NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
 1853             UMA_ZONE_ZINIT);
 1854         cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
 1855             sizeof(struct namecache_ts) + NAME_MAX + 1,
 1856             NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
 1857             UMA_ZONE_ZINIT);
 1858 
 1859         nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
 1860         ncbuckethash = cache_roundup_2(mp_ncpus * 64) - 1;
 1861         if (ncbuckethash > nchash)
 1862                 ncbuckethash = nchash;
 1863         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
 1864             M_WAITOK | M_ZERO);
 1865         for (i = 0; i < numbucketlocks; i++)
 1866                 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
 1867         ncvnodehash = cache_roundup_2(mp_ncpus * 64) - 1;
 1868         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
 1869             M_WAITOK | M_ZERO);
 1870         for (i = 0; i < numvnodelocks; i++)
 1871                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
 1872         ncpurgeminvnodes = numbucketlocks;
 1873 
 1874         ncneghash = 3;
 1875         neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
 1876             M_WAITOK | M_ZERO);
 1877         for (i = 0; i < numneglists; i++) {
 1878                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
 1879                 TAILQ_INIT(&neglists[i].nl_list);
 1880         }
 1881         mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
 1882         TAILQ_INIT(&ncneg_hot.nl_list);
 1883 
 1884         mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
 1885 
 1886         numcalls = counter_u64_alloc(M_WAITOK);
 1887         dothits = counter_u64_alloc(M_WAITOK);
 1888         dotdothits = counter_u64_alloc(M_WAITOK);
 1889         numchecks = counter_u64_alloc(M_WAITOK);
 1890         nummiss = counter_u64_alloc(M_WAITOK);
 1891         nummisszap = counter_u64_alloc(M_WAITOK);
 1892         numposzaps = counter_u64_alloc(M_WAITOK);
 1893         numposhits = counter_u64_alloc(M_WAITOK);
 1894         numnegzaps = counter_u64_alloc(M_WAITOK);
 1895         numneghits = counter_u64_alloc(M_WAITOK);
 1896         numfullpathcalls = counter_u64_alloc(M_WAITOK);
 1897         numfullpathfail1 = counter_u64_alloc(M_WAITOK);
 1898         numfullpathfail2 = counter_u64_alloc(M_WAITOK);
 1899         numfullpathfail4 = counter_u64_alloc(M_WAITOK);
 1900         numfullpathfound = counter_u64_alloc(M_WAITOK);
 1901 }
 1902 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
 1903 
 1904 void
 1905 cache_changesize(int newmaxvnodes)
 1906 {
 1907         struct nchashhead *new_nchashtbl, *old_nchashtbl;
 1908         u_long new_nchash, old_nchash;
 1909         struct namecache *ncp;
 1910         uint32_t hash;
 1911         int i;
 1912 
 1913         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
 1914         if (newmaxvnodes < numbucketlocks)
 1915                 newmaxvnodes = numbucketlocks;
 1916 
 1917         new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
 1918         /* If same hash table size, nothing to do */
 1919         if (nchash == new_nchash) {
 1920                 free(new_nchashtbl, M_VFSCACHE);
 1921                 return;
 1922         }
 1923         /*
 1924          * Move everything from the old hash table to the new table.
 1925          * None of the namecache entries can be removed out from under us,
 1926          * because removal requires the vnode and bucket locks, all of which we hold.
 1927          */
 1928         cache_lock_all_vnodes();
 1929         cache_lock_all_buckets();
 1930         old_nchashtbl = nchashtbl;
 1931         old_nchash = nchash;
 1932         nchashtbl = new_nchashtbl;
 1933         nchash = new_nchash;
 1934         for (i = 0; i <= old_nchash; i++) {
 1935                 while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
 1936                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
 1937                             ncp->nc_dvp);
 1938                         LIST_REMOVE(ncp, nc_hash);
 1939                         LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
 1940                 }
 1941         }
 1942         cache_unlock_all_buckets();
 1943         cache_unlock_all_vnodes();
 1944         free(old_nchashtbl, M_VFSCACHE);
 1945 }
 1946 
 1947 /*
 1948  * Invalidate all entries to a particular vnode.
 1949  */
 1950 void
 1951 cache_purge(struct vnode *vp)
 1952 {
 1953         TAILQ_HEAD(, namecache) ncps;
 1954         struct namecache *ncp, *nnp;
 1955         struct mtx *vlp, *vlp2;
 1956 
 1957         CTR1(KTR_VFS, "cache_purge(%p)", vp);
 1958         SDT_PROBE1(vfs, namecache, purge, done, vp);
 1959         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
 1960             vp->v_cache_dd == NULL)
 1961                 return;
 1962         TAILQ_INIT(&ncps);
 1963         vlp = VP2VNODELOCK(vp);
 1964         vlp2 = NULL;
 1965         mtx_lock(vlp);
 1966 retry:
 1967         while (!LIST_EMPTY(&vp->v_cache_src)) {
 1968                 ncp = LIST_FIRST(&vp->v_cache_src);
 1969                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 1970                         goto retry;
 1971                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 1972         }
 1973         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
 1974                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
 1975                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 1976                         goto retry;
 1977                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 1978         }
 1979         ncp = vp->v_cache_dd;
 1980         if (ncp != NULL) {
 1981                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
 1982                    ("lost dotdot link"));
 1983                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 1984                         goto retry;
 1985                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 1986         }
 1987         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
 1988         mtx_unlock(vlp);
 1989         if (vlp2 != NULL)
 1990                 mtx_unlock(vlp2);
 1991         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 1992                 cache_free(ncp);
 1993         }
 1994 }
 1995 
 1996 /*
 1997  * Invalidate all negative entries for a particular directory vnode.
 1998  */
 1999 void
 2000 cache_purge_negative(struct vnode *vp)
 2001 {
 2002         TAILQ_HEAD(, namecache) ncps;
 2003         struct namecache *ncp, *nnp;
 2004         struct mtx *vlp;
 2005 
 2006         CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
 2007         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
 2008         if (LIST_EMPTY(&vp->v_cache_src))
 2009                 return;
 2010         TAILQ_INIT(&ncps);
 2011         vlp = VP2VNODELOCK(vp);
 2012         mtx_lock(vlp);
 2013         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
 2014                 if (!(ncp->nc_flag & NCF_NEGATIVE))
 2015                         continue;
 2016                 cache_zap_negative_locked_vnode_kl(ncp, vp);
 2017                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 2018         }
 2019         mtx_unlock(vlp);
 2020         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 2021                 cache_free(ncp);
 2022         }
 2023 }
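
/*
 * Editorial sketch, not part of the original file: typical, hypothetical
 * uses of the two purge routines above.  A filesystem invalidates every
 * cached name resolving to a vnode once the object is gone, and drops the
 * cached negative entries under a directory once a new name appears in it:
 *
 *	cache_purge(vp);		vp was removed or is being reclaimed
 *	cache_purge_negative(dvp);	a name was just created/renamed in dvp
 */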
 2024 
 2025 /*
 2026  * Flush all entries referencing a particular filesystem.
 2027  */
 2028 void
 2029 cache_purgevfs(struct mount *mp, bool force)
 2030 {
 2031         TAILQ_HEAD(, namecache) ncps;
 2032         struct mtx *vlp1, *vlp2;
 2033         struct rwlock *blp;
 2034         struct nchashhead *bucket;
 2035         struct namecache *ncp, *nnp;
 2036         u_long i, j, n_nchash;
 2037         int error;
 2038 
 2039         /* Scan hash tables for applicable entries */
 2040         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
 2041         if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
 2042                 return;
 2043         TAILQ_INIT(&ncps);
 2044         n_nchash = nchash + 1;
 2045         vlp1 = vlp2 = NULL;
 2046         for (i = 0; i < numbucketlocks; i++) {
 2047                 blp = (struct rwlock *)&bucketlocks[i];
 2048                 rw_wlock(blp);
 2049                 for (j = i; j < n_nchash; j += numbucketlocks) {
 2050 retry:
 2051                         bucket = &nchashtbl[j];
 2052                         LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
 2053                                 cache_assert_bucket_locked(ncp, RA_WLOCKED);
 2054                                 if (ncp->nc_dvp->v_mount != mp)
 2055                                         continue;
 2056                                 error = cache_zap_wlocked_bucket_kl(ncp, blp,
 2057                                     &vlp1, &vlp2);
 2058                                 if (error != 0)
 2059                                         goto retry;
 2060                                 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
 2061                         }
 2062                 }
 2063                 rw_wunlock(blp);
 2064                 if (vlp1 == NULL && vlp2 == NULL)
 2065                         cache_maybe_yield();
 2066         }
 2067         if (vlp1 != NULL)
 2068                 mtx_unlock(vlp1);
 2069         if (vlp2 != NULL)
 2070                 mtx_unlock(vlp2);
 2071 
 2072         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 2073                 cache_free(ncp);
 2074         }
 2075 }
 2076 
 2077 /*
 2078  * Perform canonical checks and a cache lookup, and pass the request on to
 2079  * the filesystem through VOP_CACHEDLOOKUP() only if needed.
 2080  */
 2081 
 2082 int
 2083 vfs_cache_lookup(struct vop_lookup_args *ap)
 2084 {
 2085         struct vnode *dvp;
 2086         int error;
 2087         struct vnode **vpp = ap->a_vpp;
 2088         struct componentname *cnp = ap->a_cnp;
 2089         struct ucred *cred = cnp->cn_cred;
 2090         int flags = cnp->cn_flags;
 2091         struct thread *td = cnp->cn_thread;
 2092 
 2093         *vpp = NULL;
 2094         dvp = ap->a_dvp;
 2095 
 2096         if (dvp->v_type != VDIR)
 2097                 return (ENOTDIR);
 2098 
 2099         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 2100             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 2101                 return (EROFS);
 2102 
 2103         error = VOP_ACCESS(dvp, VEXEC, cred, td);
 2104         if (error)
 2105                 return (error);
 2106 
 2107         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 2108         if (error == 0)
 2109                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 2110         if (error == -1)
 2111                 return (0);
 2112         return (error);
 2113 }
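
/*
 * Editorial sketch, not part of the original file: how a filesystem
 * typically plugs vfs_cache_lookup() into its vnode operations vector so
 * that the namecache is consulted first and its own lookup routine only
 * runs on a miss.  The ufs names are used purely as an illustration.
 *
 *	struct vop_vector ufs_vnodeops = {
 *		...
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= ufs_lookup,
 *		...
 *	};
 */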
 2114 
 2115 /*
 2116  * XXX All of these sysctls would probably be more productive dead.
 2117  */
 2118 static int __read_mostly disablecwd;
 2119 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
 2120    "Disable the getcwd syscall");
 2121 
 2122 /* Implementation of the getcwd syscall. */
 2123 int
 2124 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
 2125 {
 2126 
 2127         return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
 2128             MAXPATHLEN));
 2129 }
 2130 
 2131 int
 2132 kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, size_t buflen,
 2133     size_t path_max)
 2134 {
 2135         char *bp, *tmpbuf;
 2136         struct filedesc *fdp;
 2137         struct vnode *cdir, *rdir;
 2138         int error;
 2139 
 2140         if (__predict_false(disablecwd))
 2141                 return (ENODEV);
 2142         if (__predict_false(buflen < 2))
 2143                 return (EINVAL);
 2144         if (buflen > path_max)
 2145                 buflen = path_max;
 2146 
 2147         tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
 2148         fdp = td->td_proc->p_fd;
 2149         FILEDESC_SLOCK(fdp);
 2150         cdir = fdp->fd_cdir;
 2151         vrefact(cdir);
 2152         rdir = fdp->fd_rdir;
 2153         vrefact(rdir);
 2154         FILEDESC_SUNLOCK(fdp);
 2155         error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
 2156         vrele(rdir);
 2157         vrele(cdir);
 2158 
 2159         if (!error) {
 2160                 if (bufseg == UIO_SYSSPACE)
 2161                         bcopy(bp, buf, strlen(bp) + 1);
 2162                 else
 2163                         error = copyout(bp, buf, strlen(bp) + 1);
 2164 #ifdef KTRACE
 2165                 if (KTRPOINT(curthread, KTR_NAMEI))
 2166                         ktrnamei(bp);
 2167 #endif
 2168         }
 2169         free(tmpbuf, M_TEMP);
 2170         return (error);
 2171 }
 2172 
 2173 /*
 2174  * Thus begins the fullpath magic.
 2175  */
 2176 
 2177 static int __read_mostly disablefullpath;
 2178 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
 2179     "Disable the vn_fullpath function");
 2180 
 2181 /*
 2182  * Retrieve the full filesystem path that corresponds to a vnode from the
 2183  * name cache (if available).
 2184  */
 2185 int
 2186 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
 2187 {
 2188         char *buf;
 2189         struct filedesc *fdp;
 2190         struct vnode *rdir;
 2191         int error;
 2192 
 2193         if (__predict_false(disablefullpath))
 2194                 return (ENODEV);
 2195         if (__predict_false(vn == NULL))
 2196                 return (EINVAL);
 2197 
 2198         buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 2199         fdp = td->td_proc->p_fd;
 2200         FILEDESC_SLOCK(fdp);
 2201         rdir = fdp->fd_rdir;
 2202         vrefact(rdir);
 2203         FILEDESC_SUNLOCK(fdp);
 2204         error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
 2205         vrele(rdir);
 2206 
 2207         if (!error)
 2208                 *freebuf = buf;
 2209         else
 2210                 free(buf, M_TEMP);
 2211         return (error);
 2212 }
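
/*
 * Editorial sketch, not part of the original file: the retbuf/freebuf
 * protocol of vn_fullpath() (and vn_fullpath_global() below).  retbuf
 * points into the buffer returned through freebuf, so the caller uses the
 * former and frees the latter:
 *
 *	char *fullpath, *freepath;
 *
 *	error = vn_fullpath(curthread, vp, &fullpath, &freepath);
 *	if (error == 0) {
 *		printf("vnode %p resolves to %s\n", vp, fullpath);
 *		free(freepath, M_TEMP);
 *	}
 */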
 2213 
 2214 /*
 2215  * This function is similar to vn_fullpath, but it attempts to look up the
 2216  * pathname relative to the global root mount point.  This is required for the
 2217  * auditing sub-system, as audited pathnames must be absolute, relative to the
 2218  * global root mount point.
 2219  */
 2220 int
 2221 vn_fullpath_global(struct thread *td, struct vnode *vn,
 2222     char **retbuf, char **freebuf)
 2223 {
 2224         char *buf;
 2225         int error;
 2226 
 2227         if (__predict_false(disablefullpath))
 2228                 return (ENODEV);
 2229         if (__predict_false(vn == NULL))
 2230                 return (EINVAL);
 2231         buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 2232         error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
 2233         if (!error)
 2234                 *freebuf = buf;
 2235         else
 2236                 free(buf, M_TEMP);
 2237         return (error);
 2238 }
 2239 
 2240 int
 2241 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
 2242 {
 2243         struct vnode *dvp;
 2244         struct namecache *ncp;
 2245         struct mtx *vlp;
 2246         int error;
 2247 
 2248         vlp = VP2VNODELOCK(*vp);
 2249         mtx_lock(vlp);
 2250         TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
 2251                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 2252                         break;
 2253         }
 2254         if (ncp != NULL) {
 2255                 if (*buflen < ncp->nc_nlen) {
 2256                         mtx_unlock(vlp);
 2257                         vrele(*vp);
 2258                         counter_u64_add(numfullpathfail4, 1);
 2259                         error = ENOMEM;
 2260                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
 2261                             vp, NULL);
 2262                         return (error);
 2263                 }
 2264                 *buflen -= ncp->nc_nlen;
 2265                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
 2266                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
 2267                     ncp->nc_name, vp);
 2268                 dvp = *vp;
 2269                 *vp = ncp->nc_dvp;
 2270                 vref(*vp);
 2271                 mtx_unlock(vlp);
 2272                 vrele(dvp);
 2273                 return (0);
 2274         }
 2275         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
 2276 
 2277         mtx_unlock(vlp);
 2278         vn_lock(*vp, LK_SHARED | LK_RETRY);
 2279         error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
 2280         vput(*vp);
 2281         if (error) {
 2282                 counter_u64_add(numfullpathfail2, 1);
 2283                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
 2284                 return (error);
 2285         }
 2286 
 2287         *vp = dvp;
 2288         if (dvp->v_iflag & VI_DOOMED) {
 2289                 /* forced unmount */
 2290                 vrele(dvp);
 2291                 error = ENOENT;
 2292                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
 2293                 return (error);
 2294         }
 2295         /*
 2296          * *vp still has its use count incremented.
 2297          */
 2298 
 2299         return (0);
 2300 }
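
/*
 * Editorial sketch, not part of the original file: the calling contract of
 * vn_vptocnp().  It translates a single pathname component: on success *vp
 * is replaced with a referenced parent directory, the reference on the old
 * vnode is dropped, and the component name is written at the tail of the
 * buffer with *buflen moved back to the start of what was written.  A
 * hypothetical single step, modelled on vn_fullpath1() below:
 *
 *	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 *	buflen = MAXPATHLEN - 1;
 *	buf[buflen] = '\0';		reserve the terminating NUL
 *	vref(vp);
 *	error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 *	if (error == 0)
 *		buf + buflen is the name of the original vnode in its parent
 */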
 2301 
 2302 /*
 2303  * The magic behind kern___getcwd() and vn_fullpath().
 2304  */
 2305 static int
 2306 vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
 2307     char *buf, char **retbuf, u_int buflen)
 2308 {
 2309         int error, slash_prefixed;
 2310 #ifdef KDTRACE_HOOKS
 2311         struct vnode *startvp = vp;
 2312 #endif
 2313         struct vnode *vp1;
 2314 
 2315         buflen--;
 2316         buf[buflen] = '\0';
 2317         error = 0;
 2318         slash_prefixed = 0;
 2319 
 2320         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
 2321         counter_u64_add(numfullpathcalls, 1);
 2322         vref(vp);
 2323         if (vp->v_type != VDIR) {
 2324                 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 2325                 if (error)
 2326                         return (error);
 2327                 if (buflen == 0) {
 2328                         vrele(vp);
 2329                         return (ENOMEM);
 2330                 }
 2331                 buf[--buflen] = '/';
 2332                 slash_prefixed = 1;
 2333         }
 2334         while (vp != rdir && vp != rootvnode) {
 2335                 /*
 2336                  * The vp vnode must already be fully constructed,
 2337                  * since it is either found in the namecache or obtained
 2338                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
 2339                  * without obtaining the vnode lock.
 2340                  */
 2341                 if ((vp->v_vflag & VV_ROOT) != 0) {
 2342                         vn_lock(vp, LK_RETRY | LK_SHARED);
 2343 
 2344                         /*
 2345                          * With the vnode locked, check for races with
 2346                          * unmount, forced or not.  Note that we
 2347                          * already verified that vp is not equal to
 2348                          * the root vnode, which means that
 2349                          * mnt_vnodecovered can be NULL only for the
 2350                          * case of unmount.
 2351                          */
 2352                         if ((vp->v_iflag & VI_DOOMED) != 0 ||
 2353                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
 2354                             vp1->v_mountedhere != vp->v_mount) {
 2355                                 vput(vp);
 2356                                 error = ENOENT;
 2357                                 SDT_PROBE3(vfs, namecache, fullpath, return,
 2358                                     error, vp, NULL);
 2359                                 break;
 2360                         }
 2361 
 2362                         vref(vp1);
 2363                         vput(vp);
 2364                         vp = vp1;
 2365                         continue;
 2366                 }
 2367                 if (vp->v_type != VDIR) {
 2368                         vrele(vp);
 2369                         counter_u64_add(numfullpathfail1, 1);
 2370                         error = ENOTDIR;
 2371                         SDT_PROBE3(vfs, namecache, fullpath, return,
 2372                             error, vp, NULL);
 2373                         break;
 2374                 }
 2375                 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 2376                 if (error)
 2377                         break;
 2378                 if (buflen == 0) {
 2379                         vrele(vp);
 2380                         error = ENOMEM;
 2381                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
 2382                             startvp, NULL);
 2383                         break;
 2384                 }
 2385                 buf[--buflen] = '/';
 2386                 slash_prefixed = 1;
 2387         }
 2388         if (error)
 2389                 return (error);
 2390         if (!slash_prefixed) {
 2391                 if (buflen == 0) {
 2392                         vrele(vp);
 2393                         counter_u64_add(numfullpathfail4, 1);
 2394                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
 2395                             startvp, NULL);
 2396                         return (ENOMEM);
 2397                 }
 2398                 buf[--buflen] = '/';
 2399         }
 2400         counter_u64_add(numfullpathfound, 1);
 2401         vrele(vp);
 2402 
 2403         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
 2404         *retbuf = buf + buflen;
 2405         return (0);
 2406 }
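
/*
 * Editorial note, not part of the original file: a worked example of the
 * backwards construction performed by vn_fullpath1().  For a vnode whose
 * path is the hypothetical /usr/src/sys, the buffer is filled from the end
 * one component at a time:
 *
 *	"sys" -> "/sys" -> "src/sys" -> "/src/sys" -> "usr/src/sys"
 *	      -> "/usr/src/sys"
 *
 * and *retbuf is finally set to buf + buflen, i.e. to the leading '/' of
 * the finished string; the unused front of buf is simply ignored.
 */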
 2407 
 2408 struct vnode *
 2409 vn_dir_dd_ino(struct vnode *vp)
 2410 {
 2411         struct namecache *ncp;
 2412         struct vnode *ddvp;
 2413         struct mtx *vlp;
 2414 
 2415         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
 2416         vlp = VP2VNODELOCK(vp);
 2417         mtx_lock(vlp);
 2418         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
 2419                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
 2420                         continue;
 2421                 ddvp = ncp->nc_dvp;
 2422                 vhold(ddvp);
 2423                 mtx_unlock(vlp);
 2424                 if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread))
 2425                         return (NULL);
 2426                 return (ddvp);
 2427         }
 2428         mtx_unlock(vlp);
 2429         return (NULL);
 2430 }
 2431 
 2432 int
 2433 vn_commname(struct vnode *vp, char *buf, u_int buflen)
 2434 {
 2435         struct namecache *ncp;
 2436         struct mtx *vlp;
 2437         int l;
 2438 
 2439         vlp = VP2VNODELOCK(vp);
 2440         mtx_lock(vlp);
 2441         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
 2442                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 2443                         break;
 2444         if (ncp == NULL) {
 2445                 mtx_unlock(vlp);
 2446                 return (ENOENT);
 2447         }
 2448         l = min(ncp->nc_nlen, buflen - 1);
 2449         memcpy(buf, ncp->nc_name, l);
 2450         mtx_unlock(vlp);
 2451         buf[l] = '\0';
 2452         return (0);
 2453 }
 2454 
 2455 /* ABI compat shims for old kernel modules. */
 2456 #undef cache_enter
 2457 
 2458 void    cache_enter(struct vnode *dvp, struct vnode *vp,
 2459             struct componentname *cnp);
 2460 
 2461 void
 2462 cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 2463 {
 2464 
 2465         cache_enter_time(dvp, vp, cnp, NULL, NULL);
 2466 }
 2467 
 2468 /*
 2469  * This function updates the path string to the vnode's full global path
 2470  * and checks the size of the new path string against the pathlen argument.
 2471  *
 2472  * Requires a locked, referenced vnode.
 2473  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 2474  *
 2475  * If the sysctl debug.disablefullpath is set, ENODEV is returned, the
 2476  * vnode is left locked, and the path remains untouched.
 2477  *
 2478  * If vp is a directory, the call to vn_fullpath_global() always succeeds
 2479  * because it falls back to the ".." lookup if the namecache lookup fails.
 2480  */
 2481 int
 2482 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
 2483     u_int pathlen)
 2484 {
 2485         struct nameidata nd;
 2486         struct vnode *vp1;
 2487         char *rpath, *fbuf;
 2488         int error;
 2489 
 2490         ASSERT_VOP_ELOCKED(vp, __func__);
 2491 
 2492         /* Return ENODEV if sysctl debug.disablefullpath==1 */
 2493         if (__predict_false(disablefullpath))
 2494                 return (ENODEV);
 2495 
 2496         /* Construct global filesystem path from vp. */
 2497         VOP_UNLOCK(vp, 0);
 2498         error = vn_fullpath_global(td, vp, &rpath, &fbuf);
 2499 
 2500         if (error != 0) {
 2501                 vrele(vp);
 2502                 return (error);
 2503         }
 2504 
 2505         if (strlen(rpath) >= pathlen) {
 2506                 vrele(vp);
 2507                 error = ENAMETOOLONG;
 2508                 goto out;
 2509         }
 2510 
 2511         /*
 2512          * Re-lookup the vnode by path to detect a possible rename.
 2513          * As a side effect, the vnode is relocked.
 2514          * If the vnode was renamed, return ENOENT.
 2515          */
 2516         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 2517             UIO_SYSSPACE, path, td);
 2518         error = namei(&nd);
 2519         if (error != 0) {
 2520                 vrele(vp);
 2521                 goto out;
 2522         }
 2523         NDFREE(&nd, NDF_ONLY_PNBUF);
 2524         vp1 = nd.ni_vp;
 2525         vrele(vp);
 2526         if (vp1 == vp)
 2527                 strcpy(path, rpath);
 2528         else {
 2529                 vput(vp1);
 2530                 error = ENOENT;
 2531         }
 2532 
 2533 out:
 2534         free(fbuf, M_TEMP);
 2535         return (error);
 2536 }
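
/*
 * Editorial sketch, not part of the original file: a hypothetical caller of
 * vn_path_to_global_path().  The vnode is passed in locked and referenced,
 * and pathbuf is a writable buffer of at least MAXPATHLEN bytes that
 * already holds some path naming vp:
 *
 *	error = vn_path_to_global_path(td, vp, pathbuf, MAXPATHLEN);
 *	if (error == 0)
 *		vp is locked again and pathbuf holds the global path
 *	else if (error == ENODEV)
 *		debug.disablefullpath is set; vp stays locked, pathbuf
 *		is untouched
 *	else
 *		vp comes back unlocked (e.g. ENAMETOOLONG, ENOENT)
 */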
 2537 
 2538 #ifdef DDB
 2539 static void
 2540 db_print_vpath(struct vnode *vp)
 2541 {
 2542 
 2543         while (vp != NULL) {
 2544                 db_printf("%p: ", vp);
 2545                 if (vp == rootvnode) {
 2546                         db_printf("/");
 2547                         vp = NULL;
 2548                 } else {
 2549                         if (vp->v_vflag & VV_ROOT) {
 2550                                 db_printf("<mount point>");
 2551                                 vp = vp->v_mount->mnt_vnodecovered;
 2552                         } else {
 2553                                 struct namecache *ncp;
 2554                                 char *ncn;
 2555                                 int i;
 2556 
 2557                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
 2558                                 if (ncp != NULL) {
 2559                                         ncn = ncp->nc_name;
 2560                                         for (i = 0; i < ncp->nc_nlen; i++)
 2561                                                 db_printf("%c", *ncn++);
 2562                                         vp = ncp->nc_dvp;
 2563                                 } else {
 2564                                         vp = NULL;
 2565                                 }
 2566                         }
 2567                 }
 2568                 db_printf("\n");
 2569         }
 2570 
 2571         return;
 2572 }
 2573 
 2574 DB_SHOW_COMMAND(vpath, db_show_vpath)
 2575 {
 2576         struct vnode *vp;
 2577 
 2578         if (!have_addr) {
 2579                 db_printf("usage: show vpath <struct vnode *>\n");
 2580                 return;
 2581         }
 2582 
 2583         vp = (struct vnode *)addr;
 2584         db_print_vpath(vp);
 2585 }
 2586 
 2587 #endif
