FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_cache.c


    1 /*-
    2  * Copyright (c) 1989, 1993, 1995
    3  *      The Regents of the University of California.  All rights reserved.
    4  *
    5  * This code is derived from software contributed to Berkeley by
    6  * Poul-Henning Kamp of the FreeBSD Project.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  * 4. Neither the name of the University nor the names of its contributors
   17  *    may be used to endorse or promote products derived from this software
   18  *    without specific prior written permission.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   30  * SUCH DAMAGE.
   31  *
   32  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
   33  */
   34 
   35 #include <sys/cdefs.h>
   36 __FBSDID("$FreeBSD: releng/11.1/sys/kern/vfs_cache.c 318367 2017-05-16 19:35:25Z mjg $");
   37 
   38 #include "opt_ktrace.h"
   39 
   40 #include <sys/param.h>
   41 #include <sys/systm.h>
   42 #include <sys/counter.h>
   43 #include <sys/filedesc.h>
   44 #include <sys/fnv_hash.h>
   45 #include <sys/kernel.h>
   46 #include <sys/lock.h>
   47 #include <sys/malloc.h>
   48 #include <sys/fcntl.h>
   49 #include <sys/mount.h>
   50 #include <sys/namei.h>
   51 #include <sys/proc.h>
   52 #include <sys/rwlock.h>
   53 #include <sys/sdt.h>
   54 #include <sys/smp.h>
   55 #include <sys/syscallsubr.h>
   56 #include <sys/sysctl.h>
   57 #include <sys/sysproto.h>
   58 #include <sys/vnode.h>
   59 #ifdef KTRACE
   60 #include <sys/ktrace.h>
   61 #endif
   62 
   63 #include <vm/uma.h>
   64 
   65 SDT_PROVIDER_DECLARE(vfs);
   66 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
   67     "struct vnode *");
   68 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
   69     "char *");
   70 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
   71 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
   72     "char *", "struct vnode *");
   73 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
   74 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
   75     "struct vnode *", "char *");
   76 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
   77     "struct vnode *");
   78 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
   79     "struct vnode *", "char *");
   80 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
   81     "char *");
   82 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
   83 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
   84 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
   85 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
   86     "struct vnode *");
   87 SDT_PROBE_DEFINE3(vfs, namecache, zap_negative, done, "struct vnode *",
   88     "char *", "int");
   89 SDT_PROBE_DEFINE3(vfs, namecache, shrink_negative, done, "struct vnode *",
   90     "char *", "int");
   91 
   92 /*
   93  * This structure describes the elements in the cache of recent
   94  * names looked up by namei.
   95  */
   96 
   97 struct  namecache {
   98         LIST_ENTRY(namecache) nc_hash;  /* hash chain */
   99         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
  100         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
  101         struct  vnode *nc_dvp;          /* vnode of parent of name */
  102         union {
  103                 struct  vnode *nu_vp;   /* vnode the name refers to */
  104                 u_int   nu_neghits;     /* negative entry hits */
  105         } n_un;
  106         u_char  nc_flag;                /* flag bits */
  107         u_char  nc_nlen;                /* length of name */
  108         char    nc_name[0];             /* segment name + nul */
  109 };
  110 
  111 /*
  112  * struct namecache_ts repeats struct namecache layout up to the
  113  * nc_nlen member.
  114  * struct namecache_ts is used in place of struct namecache when time(s) need
  115  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
   116  * both a non-dotdot directory name and dotdot for the directory's
  117  * parent.
  118  */
  119 struct  namecache_ts {
  120         LIST_ENTRY(namecache) nc_hash;  /* hash chain */
  121         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
  122         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
  123         struct  vnode *nc_dvp;          /* vnode of parent of name */
  124         union {
  125                 struct  vnode *nu_vp;   /* vnode the name refers to */
  126                 u_int   nu_neghits;     /* negative entry hits */
  127         } n_un;
  128         u_char  nc_flag;                /* flag bits */
  129         u_char  nc_nlen;                /* length of name */
  130         struct  timespec nc_time;       /* timespec provided by fs */
  131         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
  132         int     nc_ticks;               /* ticks value when entry was added */
  133         char    nc_name[0];             /* segment name + nul */
  134 };
  135 
  136 #define nc_vp           n_un.nu_vp
  137 #define nc_neghits      n_un.nu_neghits
  138 
  139 /*
  140  * Flags in namecache.nc_flag
  141  */
  142 #define NCF_WHITE       0x01
  143 #define NCF_ISDOTDOT    0x02
  144 #define NCF_TS          0x04
  145 #define NCF_DTS         0x08
  146 #define NCF_DVDROP      0x10
  147 #define NCF_NEGATIVE    0x20
  148 #define NCF_HOTNEGATIVE 0x40
  149 
  150 /*
  151  * Name caching works as follows:
  152  *
  153  * Names found by directory scans are retained in a cache
  154  * for future reference.  It is managed LRU, so frequently
  155  * used names will hang around.  Cache is indexed by hash value
  156  * obtained from (vp, name) where vp refers to the directory
  157  * containing name.
  158  *
   159  * If it is a "negative" entry, (i.e. for a name that is known NOT to
   160  * exist), the vnode pointer will be NULL.
  161  *
  162  * Upon reaching the last segment of a path, if the reference
  163  * is for DELETE, or NOCACHE is set (rewrite), and the
  164  * name is located in the cache, it will be dropped.
  165  *
  166  * These locks are used (in the order in which they can be taken):
  167  * NAME         TYPE    ROLE
  168  * vnodelock    mtx     vnode lists and v_cache_dd field protection
  169  * bucketlock   rwlock  for access to given set of hash buckets
  170  * neglist      mtx     negative entry LRU management
  171  *
  172  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
  173  * shrinking the LRU list.
  174  *
  175  * It is legal to take multiple vnodelock and bucketlock locks. The locking
  176  * order is lower address first. Both are recursive.
  177  *
  178  * "." lookups are lockless.
  179  *
  180  * ".." and vnode -> name lookups require vnodelock.
  181  *
  182  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
  183  *
  184  * Insertions and removals of entries require involved vnodes and bucketlocks
  185  * to be write-locked to prevent other threads from seeing the entry.
  186  *
  187  * Some lookups result in removal of the found entry (e.g. getting rid of a
  188  * negative entry with the intent to create a positive one), which poses a
   189  * problem when multiple threads reach the same state. Similarly, two different
  190  * threads can purge two different vnodes and try to remove the same name.
  191  *
  192  * If the already held vnode lock is lower than the second required lock, we
  193  * can just take the other lock. However, in the opposite case, this could
   194  * deadlock.  This is resolved by trylocking the second lock and, if that fails,
   195  * unlocking the first, locking everything in order and revalidating the state.
  196  */
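
/*
 * A minimal sketch of the ordering rule above: taking two vnodelocks from
 * scratch simply means locking the lower address first.  The helpers later
 * in this file (cache_sort(), cache_trylock_vnodes(), cache_zap_locked_vnode())
 * follow the same idea while also coping with NULL locks, trylock failure
 * and revalidation.
 */
#if 0	/* illustrative sketch only; the function name below is hypothetical */
static void
example_lock_vnodelocks_in_order(struct mtx *vlp1, struct mtx *vlp2)
{
	struct mtx *tmp;

	if (vlp1 > vlp2) {
		tmp = vlp1;
		vlp1 = vlp2;
		vlp2 = tmp;
	}
	mtx_lock(vlp1);
	mtx_lock(vlp2);
}
#endif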
  197 
  198 /*
  199  * Structures associated with name caching.
  200  */
  201 #define NCHHASH(hash) \
  202         (&nchashtbl[(hash) & nchash])
  203 static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
  204 static u_long __read_mostly     nchash;                 /* size of hash table */
  205 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
  206     "Size of namecache hash table");
  207 static u_long __read_mostly     ncnegfactor = 16; /* ratio of negative entries */
  208 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
  209     "Ratio of negative namecache entries");
  210 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
  211 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
  212     "Number of negative entries in namecache");
  213 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
  214 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
  215     "Number of namecache entries");
  216 static u_long __exclusive_cache_line    numcachehv;/* number of cache entries with vnodes held */
  217 SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0,
  218     "Number of namecache entries with vnodes held");
  219 u_int __read_mostly     ncsizefactor = 2;
  220 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
  221     "Size factor for namecache");
  222 static u_int __read_mostly      ncpurgeminvnodes;
  223 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
  224     "Number of vnodes below which purgevfs ignores the request");
  225 static u_int __read_mostly      ncneghitsrequeue = 8;
  226 SYSCTL_UINT(_vfs, OID_AUTO, ncneghitsrequeue, CTLFLAG_RW, &ncneghitsrequeue, 0,
  227     "Number of hits to requeue a negative entry in the LRU list");
  228 
  229 struct nchstats nchstats;               /* cache effectiveness statistics */
  230 
  231 static struct mtx       ncneg_shrink_lock;
  232 static int      shrink_list_turn;
  233 
  234 struct neglist {
  235         struct mtx              nl_lock;
  236         TAILQ_HEAD(, namecache) nl_list;
  237 } __aligned(CACHE_LINE_SIZE);
  238 
  239 static struct neglist __read_mostly     *neglists;
  240 static struct neglist ncneg_hot;
  241 
  242 #define numneglists (ncneghash + 1)
  243 static u_int __read_mostly      ncneghash;
  244 static inline struct neglist *
  245 NCP2NEGLIST(struct namecache *ncp)
  246 {
  247 
  248         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
  249 }
  250 
  251 #define numbucketlocks (ncbuckethash + 1)
  252 static u_int __read_mostly  ncbuckethash;
  253 static struct rwlock_padalign __read_mostly  *bucketlocks;
  254 #define HASH2BUCKETLOCK(hash) \
  255         ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
  256 
  257 #define numvnodelocks (ncvnodehash + 1)
  258 static u_int __read_mostly  ncvnodehash;
  259 static struct mtx __read_mostly *vnodelocks;
  260 static inline struct mtx *
  261 VP2VNODELOCK(struct vnode *vp)
  262 {
  263 
  264         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
  265 }
  266 
  267 /*
  268  * UMA zones for the VFS cache.
  269  *
  270  * The small cache is used for entries with short names, which are the
  271  * most common.  The large cache is used for entries which are too big to
  272  * fit in the small cache.
  273  */
  274 static uma_zone_t __read_mostly cache_zone_small;
  275 static uma_zone_t __read_mostly cache_zone_small_ts;
  276 static uma_zone_t __read_mostly cache_zone_large;
  277 static uma_zone_t __read_mostly cache_zone_large_ts;
  278 
  279 #define CACHE_PATH_CUTOFF       35
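
/*
 * For example, a short component such as "usr" (3 bytes plus nul) is served
 * from cache_zone_small, while a component longer than CACHE_PATH_CUTOFF
 * bytes falls back to cache_zone_large; the _ts zones are used instead when
 * timestamps have to be kept (see cache_alloc() below).
 */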
  280 
  281 static struct namecache *
  282 cache_alloc(int len, int ts)
  283 {
  284 
  285         if (len > CACHE_PATH_CUTOFF) {
  286                 if (ts)
  287                         return (uma_zalloc(cache_zone_large_ts, M_WAITOK));
  288                 else
  289                         return (uma_zalloc(cache_zone_large, M_WAITOK));
  290         }
  291         if (ts)
  292                 return (uma_zalloc(cache_zone_small_ts, M_WAITOK));
  293         else
  294                 return (uma_zalloc(cache_zone_small, M_WAITOK));
  295 }
  296 
  297 static void
  298 cache_free(struct namecache *ncp)
  299 {
  300         int ts;
  301 
  302         if (ncp == NULL)
  303                 return;
  304         ts = ncp->nc_flag & NCF_TS;
  305         if ((ncp->nc_flag & NCF_DVDROP) != 0)
  306                 vdrop(ncp->nc_dvp);
  307         if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) {
  308                 if (ts)
  309                         uma_zfree(cache_zone_small_ts, ncp);
  310                 else
  311                         uma_zfree(cache_zone_small, ncp);
  312         } else if (ts)
  313                 uma_zfree(cache_zone_large_ts, ncp);
  314         else
  315                 uma_zfree(cache_zone_large, ncp);
  316 }
  317 
  318 static char *
  319 nc_get_name(struct namecache *ncp)
  320 {
  321         struct namecache_ts *ncp_ts;
  322 
  323         if ((ncp->nc_flag & NCF_TS) == 0)
  324                 return (ncp->nc_name);
  325         ncp_ts = (struct namecache_ts *)ncp;
  326         return (ncp_ts->nc_name);
  327 }
  328 
  329 static void
  330 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
  331 {
  332 
  333         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
  334             (tsp == NULL && ticksp == NULL),
  335             ("No NCF_TS"));
  336 
  337         if (tsp != NULL)
  338                 *tsp = ((struct namecache_ts *)ncp)->nc_time;
  339         if (ticksp != NULL)
  340                 *ticksp = ((struct namecache_ts *)ncp)->nc_ticks;
  341 }
  342 
  343 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
  344 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
  345     "VFS namecache enabled");
  346 
  347 /* Export size information to userland */
  348 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
  349     sizeof(struct namecache), "sizeof(struct namecache)");
  350 
  351 /*
  352  * The new name cache statistics
  353  */
  354 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
  355     "Name cache statistics");
  356 #define STATNODE_ULONG(name, descr)     \
  357         SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
  358 #define STATNODE_COUNTER(name, descr)   \
  359         static counter_u64_t __read_mostly name; \
  360         SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr);
  361 STATNODE_ULONG(numneg, "Number of negative cache entries");
  362 STATNODE_ULONG(numcache, "Number of cache entries");
  363 STATNODE_COUNTER(numcalls, "Number of cache lookups");
  364 STATNODE_COUNTER(dothits, "Number of '.' hits");
  365 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
  366 STATNODE_COUNTER(numchecks, "Number of checks in lookup");
  367 STATNODE_COUNTER(nummiss, "Number of cache misses");
  368 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
  369 STATNODE_COUNTER(numposzaps,
  370     "Number of cache hits (positive) we do not want to cache");
  371 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
  372 STATNODE_COUNTER(numnegzaps,
  373     "Number of cache hits (negative) we do not want to cache");
  374 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
  375 /* These count for kern___getcwd(), too. */
  376 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
  377 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
  378 STATNODE_COUNTER(numfullpathfail2,
  379     "Number of fullpath search errors (VOP_VPTOCNP failures)");
  380 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
  381 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
  382 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
  383     "Number of times zap_and_exit failed to lock");
  384 static long cache_lock_vnodes_cel_3_failures;
  385 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
  386     "Number of times 3-way vnode locking failed");
  387 
  388 static void cache_zap_locked(struct namecache *ncp, bool neg_locked);
  389 static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
  390     char *buf, char **retbuf, u_int buflen);
  391 
  392 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
  393 
  394 static int cache_yield;
  395 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
  396     "Number of times cache called yield");
  397 
  398 static void
  399 cache_maybe_yield(void)
  400 {
  401 
  402         if (should_yield()) {
  403                 cache_yield++;
  404                 kern_yield(PRI_USER);
  405         }
  406 }
  407 
  408 static inline void
  409 cache_assert_vlp_locked(struct mtx *vlp)
  410 {
  411 
  412         if (vlp != NULL)
  413                 mtx_assert(vlp, MA_OWNED);
  414 }
  415 
  416 static inline void
  417 cache_assert_vnode_locked(struct vnode *vp)
  418 {
  419         struct mtx *vlp;
  420 
  421         vlp = VP2VNODELOCK(vp);
  422         cache_assert_vlp_locked(vlp);
  423 }
  424 
  425 static uint32_t
  426 cache_get_hash(char *name, u_char len, struct vnode *dvp)
  427 {
  428         uint32_t hash;
  429 
  430         hash = fnv_32_buf(name, len, FNV1_32_INIT);
  431         hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
  432         return (hash);
  433 }
  434 
  435 static inline struct rwlock *
  436 NCP2BUCKETLOCK(struct namecache *ncp)
  437 {
  438         uint32_t hash;
  439 
  440         hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
  441         return (HASH2BUCKETLOCK(hash));
  442 }
  443 
  444 #ifdef INVARIANTS
  445 static void
  446 cache_assert_bucket_locked(struct namecache *ncp, int mode)
  447 {
  448         struct rwlock *blp;
  449 
  450         blp = NCP2BUCKETLOCK(ncp);
  451         rw_assert(blp, mode);
  452 }
  453 #else
  454 #define cache_assert_bucket_locked(x, y) do { } while (0)
  455 #endif
  456 
  457 #define cache_sort(x, y)        _cache_sort((void **)(x), (void **)(y))
  458 static void
  459 _cache_sort(void **p1, void **p2)
  460 {
  461         void *tmp;
  462 
  463         if (*p1 > *p2) {
  464                 tmp = *p2;
  465                 *p2 = *p1;
  466                 *p1 = tmp;
  467         }
  468 }
  469 
  470 static void
  471 cache_lock_all_buckets(void)
  472 {
  473         u_int i;
  474 
  475         for (i = 0; i < numbucketlocks; i++)
  476                 rw_wlock(&bucketlocks[i]);
  477 }
  478 
  479 static void
  480 cache_unlock_all_buckets(void)
  481 {
  482         u_int i;
  483 
  484         for (i = 0; i < numbucketlocks; i++)
  485                 rw_wunlock(&bucketlocks[i]);
  486 }
  487 
  488 static void
  489 cache_lock_all_vnodes(void)
  490 {
  491         u_int i;
  492 
  493         for (i = 0; i < numvnodelocks; i++)
  494                 mtx_lock(&vnodelocks[i]);
  495 }
  496 
  497 static void
  498 cache_unlock_all_vnodes(void)
  499 {
  500         u_int i;
  501 
  502         for (i = 0; i < numvnodelocks; i++)
  503                 mtx_unlock(&vnodelocks[i]);
  504 }
  505 
  506 static int
  507 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
  508 {
  509 
  510         cache_sort(&vlp1, &vlp2);
  511         MPASS(vlp2 != NULL);
  512 
  513         if (vlp1 != NULL) {
  514                 if (!mtx_trylock(vlp1))
  515                         return (EAGAIN);
  516         }
  517         if (!mtx_trylock(vlp2)) {
  518                 if (vlp1 != NULL)
  519                         mtx_unlock(vlp1);
  520                 return (EAGAIN);
  521         }
  522 
  523         return (0);
  524 }
  525 
  526 static void
  527 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
  528 {
  529 
  530         MPASS(vlp1 != NULL || vlp2 != NULL);
  531 
  532         if (vlp1 != NULL)
  533                 mtx_unlock(vlp1);
  534         if (vlp2 != NULL)
  535                 mtx_unlock(vlp2);
  536 }
  537 
  538 static int
  539 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
  540 {
  541         struct nchstats snap;
  542 
  543         if (req->oldptr == NULL)
  544                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
  545 
  546         snap = nchstats;
  547         snap.ncs_goodhits = counter_u64_fetch(numposhits);
  548         snap.ncs_neghits = counter_u64_fetch(numneghits);
  549         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
  550             counter_u64_fetch(numnegzaps);
  551         snap.ncs_miss = counter_u64_fetch(nummisszap) +
  552             counter_u64_fetch(nummiss);
  553 
  554         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
  555 }
  556 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
  557     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
  558     "VFS cache effectiveness statistics");
  559 
  560 #ifdef DIAGNOSTIC
  561 /*
  562  * Grab an atomic snapshot of the name cache hash chain lengths
  563  */
  564 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
  565     "hash table stats");
  566 
  567 static int
  568 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
  569 {
  570         struct nchashhead *ncpp;
  571         struct namecache *ncp;
  572         int i, error, n_nchash, *cntbuf;
  573 
  574 retry:
  575         n_nchash = nchash + 1;  /* nchash is max index, not count */
  576         if (req->oldptr == NULL)
  577                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
  578         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
  579         cache_lock_all_buckets();
  580         if (n_nchash != nchash + 1) {
  581                 cache_unlock_all_buckets();
  582                 free(cntbuf, M_TEMP);
  583                 goto retry;
  584         }
  585         /* Scan hash tables counting entries */
  586         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
  587                 LIST_FOREACH(ncp, ncpp, nc_hash)
  588                         cntbuf[i]++;
  589         cache_unlock_all_buckets();
  590         for (error = 0, i = 0; i < n_nchash; i++)
  591                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
  592                         break;
  593         free(cntbuf, M_TEMP);
  594         return (error);
  595 }
  596 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
  597     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
  598     "nchash chain lengths");
  599 
  600 static int
  601 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
  602 {
  603         int error;
  604         struct nchashhead *ncpp;
  605         struct namecache *ncp;
  606         int n_nchash;
  607         int count, maxlength, used, pct;
  608 
  609         if (!req->oldptr)
  610                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
  611 
  612         cache_lock_all_buckets();
  613         n_nchash = nchash + 1;  /* nchash is max index, not count */
  614         used = 0;
  615         maxlength = 0;
  616 
  617         /* Scan hash tables for applicable entries */
  618         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
  619                 count = 0;
  620                 LIST_FOREACH(ncp, ncpp, nc_hash) {
  621                         count++;
  622                 }
  623                 if (count)
  624                         used++;
  625                 if (maxlength < count)
  626                         maxlength = count;
  627         }
  628         n_nchash = nchash + 1;
  629         cache_unlock_all_buckets();
  630         pct = (used * 100) / (n_nchash / 100);
  631         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
  632         if (error)
  633                 return (error);
  634         error = SYSCTL_OUT(req, &used, sizeof(used));
  635         if (error)
  636                 return (error);
  637         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
  638         if (error)
  639                 return (error);
  640         error = SYSCTL_OUT(req, &pct, sizeof(pct));
  641         if (error)
  642                 return (error);
  643         return (0);
  644 }
  645 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
  646     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
  647     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
  648 #endif
  649 
  650 /*
  651  * Negative entries management
  652  *
  653  * A variation of LRU scheme is used. New entries are hashed into one of
  654  * numneglists cold lists. Entries get promoted to the hot list on first hit.
  655  * Partial LRU for the hot list is maintained by requeueing them every
  656  * ncneghitsrequeue hits.
  657  *
  658  * The shrinker will demote hot list head and evict from the cold list in a
  659  * round-robin manner.
  660  */
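
/*
 * Roughly, the life of a negative entry looks like this (a sketch of the
 * scheme above):
 *
 *	insertion                     -> tail of one of the cold neglists
 *	first lookup hit              -> promoted to the tail of ncneg_hot
 *	every ncneghitsrequeue-th hit -> requeued to the tail of ncneg_hot
 *	shrinker pass                 -> head of ncneg_hot demoted back to its
 *	                                 cold list; the head of a cold list
 *	                                 (picked round-robin) gets evicted
 */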
  661 static void
  662 cache_negative_hit(struct namecache *ncp)
  663 {
  664         struct neglist *neglist;
  665         u_int hits;
  666 
  667         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  668         hits = atomic_fetchadd_int(&ncp->nc_neghits, 1);
  669         if (ncp->nc_flag & NCF_HOTNEGATIVE) {
  670                 if ((hits % ncneghitsrequeue) != 0)
  671                         return;
  672                 mtx_lock(&ncneg_hot.nl_lock);
  673                 if (ncp->nc_flag & NCF_HOTNEGATIVE) {
  674                         TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
  675                         TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
  676                         mtx_unlock(&ncneg_hot.nl_lock);
  677                         return;
  678                 }
  679                 /*
  680                  * The shrinker cleared the flag and removed the entry from
  681                  * the hot list. Put it back.
  682                  */
  683         } else {
  684                 mtx_lock(&ncneg_hot.nl_lock);
  685         }
  686         neglist = NCP2NEGLIST(ncp);
  687         mtx_lock(&neglist->nl_lock);
  688         if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
  689                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
  690                 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
  691                 ncp->nc_flag |= NCF_HOTNEGATIVE;
  692         }
  693         mtx_unlock(&neglist->nl_lock);
  694         mtx_unlock(&ncneg_hot.nl_lock);
  695 }
  696 
  697 static void
  698 cache_negative_insert(struct namecache *ncp, bool neg_locked)
  699 {
  700         struct neglist *neglist;
  701 
  702         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  703         cache_assert_bucket_locked(ncp, RA_WLOCKED);
  704         neglist = NCP2NEGLIST(ncp);
  705         if (!neg_locked) {
  706                 mtx_lock(&neglist->nl_lock);
  707         } else {
  708                 mtx_assert(&neglist->nl_lock, MA_OWNED);
  709         }
  710         TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
  711         if (!neg_locked)
  712                 mtx_unlock(&neglist->nl_lock);
  713         atomic_add_rel_long(&numneg, 1);
  714 }
  715 
  716 static void
  717 cache_negative_remove(struct namecache *ncp, bool neg_locked)
  718 {
  719         struct neglist *neglist;
  720         bool hot_locked = false;
  721         bool list_locked = false;
  722 
  723         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  724         cache_assert_bucket_locked(ncp, RA_WLOCKED);
  725         neglist = NCP2NEGLIST(ncp);
  726         if (!neg_locked) {
  727                 if (ncp->nc_flag & NCF_HOTNEGATIVE) {
  728                         hot_locked = true;
  729                         mtx_lock(&ncneg_hot.nl_lock);
  730                         if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
  731                                 list_locked = true;
  732                                 mtx_lock(&neglist->nl_lock);
  733                         }
  734                 } else {
  735                         list_locked = true;
  736                         mtx_lock(&neglist->nl_lock);
  737                 }
  738         }
  739         if (ncp->nc_flag & NCF_HOTNEGATIVE) {
  740                 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
  741                 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
  742         } else {
  743                 mtx_assert(&neglist->nl_lock, MA_OWNED);
  744                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
  745         }
  746         if (list_locked)
  747                 mtx_unlock(&neglist->nl_lock);
  748         if (hot_locked)
  749                 mtx_unlock(&ncneg_hot.nl_lock);
  750         atomic_subtract_rel_long(&numneg, 1);
  751 }
  752 
  753 static void
  754 cache_negative_shrink_select(int start, struct namecache **ncpp,
  755     struct neglist **neglistpp)
  756 {
  757         struct neglist *neglist;
  758         struct namecache *ncp;
  759         int i;
  760 
  761         *ncpp = ncp = NULL;
  762 
  763         for (i = start; i < numneglists; i++) {
  764                 neglist = &neglists[i];
  765                 if (TAILQ_FIRST(&neglist->nl_list) == NULL)
  766                         continue;
  767                 mtx_lock(&neglist->nl_lock);
  768                 ncp = TAILQ_FIRST(&neglist->nl_list);
  769                 if (ncp != NULL)
  770                         break;
  771                 mtx_unlock(&neglist->nl_lock);
  772         }
  773 
  774         *neglistpp = neglist;
  775         *ncpp = ncp;
  776 }
  777 
  778 static void
  779 cache_negative_zap_one(void)
  780 {
  781         struct namecache *ncp, *ncp2;
  782         struct neglist *neglist;
  783         struct mtx *dvlp;
  784         struct rwlock *blp;
  785 
  786         if (!mtx_trylock(&ncneg_shrink_lock))
  787                 return;
  788 
  789         mtx_lock(&ncneg_hot.nl_lock);
  790         ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
  791         if (ncp != NULL) {
  792                 neglist = NCP2NEGLIST(ncp);
  793                 mtx_lock(&neglist->nl_lock);
  794                 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
  795                 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
  796                 ncp->nc_flag &= ~NCF_HOTNEGATIVE;
  797                 mtx_unlock(&neglist->nl_lock);
  798         }
  799 
  800         cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
  801         shrink_list_turn++;
  802         if (shrink_list_turn == numneglists)
  803                 shrink_list_turn = 0;
  804         if (ncp == NULL && shrink_list_turn == 0)
  805                 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
  806         if (ncp == NULL) {
  807                 mtx_unlock(&ncneg_hot.nl_lock);
  808                 goto out;
  809         }
  810 
  811         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  812         dvlp = VP2VNODELOCK(ncp->nc_dvp);
  813         blp = NCP2BUCKETLOCK(ncp);
  814         mtx_unlock(&neglist->nl_lock);
  815         mtx_unlock(&ncneg_hot.nl_lock);
  816         mtx_lock(dvlp);
  817         rw_wlock(blp);
  818         mtx_lock(&neglist->nl_lock);
  819         ncp2 = TAILQ_FIRST(&neglist->nl_list);
  820         if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
  821             blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) {
  822                 ncp = NULL;
  823                 goto out_unlock_all;
  824         }
  825         SDT_PROBE3(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
  826             nc_get_name(ncp), ncp->nc_neghits);
  827 
  828         cache_zap_locked(ncp, true);
  829 out_unlock_all:
  830         mtx_unlock(&neglist->nl_lock);
  831         rw_wunlock(blp);
  832         mtx_unlock(dvlp);
  833 out:
  834         mtx_unlock(&ncneg_shrink_lock);
  835         cache_free(ncp);
  836 }
  837 
  838 /*
  839  * cache_zap_locked():
  840  *
  841  *   Removes a namecache entry from cache, whether it contains an actual
  842  *   pointer to a vnode or if it is just a negative cache entry.
  843  */
  844 static void
  845 cache_zap_locked(struct namecache *ncp, bool neg_locked)
  846 {
  847 
  848         if (!(ncp->nc_flag & NCF_NEGATIVE))
  849                 cache_assert_vnode_locked(ncp->nc_vp);
  850         cache_assert_vnode_locked(ncp->nc_dvp);
  851         cache_assert_bucket_locked(ncp, RA_WLOCKED);
  852 
  853         CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
  854             (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
  855         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
  856                 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
  857                     nc_get_name(ncp), ncp->nc_vp);
  858         } else {
  859                 SDT_PROBE3(vfs, namecache, zap_negative, done, ncp->nc_dvp,
  860                     nc_get_name(ncp), ncp->nc_neghits);
  861         }
  862         LIST_REMOVE(ncp, nc_hash);
  863         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
  864                 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
  865                 if (ncp == ncp->nc_vp->v_cache_dd)
  866                         ncp->nc_vp->v_cache_dd = NULL;
  867         } else {
  868                 cache_negative_remove(ncp, neg_locked);
  869         }
  870         if (ncp->nc_flag & NCF_ISDOTDOT) {
  871                 if (ncp == ncp->nc_dvp->v_cache_dd)
  872                         ncp->nc_dvp->v_cache_dd = NULL;
  873         } else {
  874                 LIST_REMOVE(ncp, nc_src);
  875                 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
  876                         ncp->nc_flag |= NCF_DVDROP;
  877                         atomic_subtract_rel_long(&numcachehv, 1);
  878                 }
  879         }
  880         atomic_subtract_rel_long(&numcache, 1);
  881 }
  882 
  883 static void
  884 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
  885 {
  886         struct rwlock *blp;
  887 
  888         MPASS(ncp->nc_dvp == vp);
  889         MPASS(ncp->nc_flag & NCF_NEGATIVE);
  890         cache_assert_vnode_locked(vp);
  891 
  892         blp = NCP2BUCKETLOCK(ncp);
  893         rw_wlock(blp);
  894         cache_zap_locked(ncp, false);
  895         rw_wunlock(blp);
  896 }
  897 
  898 static bool
  899 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
  900     struct mtx **vlpp)
  901 {
  902         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
  903         struct rwlock *blp;
  904 
  905         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
  906         cache_assert_vnode_locked(vp);
  907 
  908         if (ncp->nc_flag & NCF_NEGATIVE) {
  909                 if (*vlpp != NULL) {
  910                         mtx_unlock(*vlpp);
  911                         *vlpp = NULL;
  912                 }
  913                 cache_zap_negative_locked_vnode_kl(ncp, vp);
  914                 return (true);
  915         }
  916 
  917         pvlp = VP2VNODELOCK(vp);
  918         blp = NCP2BUCKETLOCK(ncp);
  919         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
  920         vlp2 = VP2VNODELOCK(ncp->nc_vp);
  921 
  922         if (*vlpp == vlp1 || *vlpp == vlp2) {
  923                 to_unlock = *vlpp;
  924                 *vlpp = NULL;
  925         } else {
  926                 if (*vlpp != NULL) {
  927                         mtx_unlock(*vlpp);
  928                         *vlpp = NULL;
  929                 }
  930                 cache_sort(&vlp1, &vlp2);
  931                 if (vlp1 == pvlp) {
  932                         mtx_lock(vlp2);
  933                         to_unlock = vlp2;
  934                 } else {
  935                         if (!mtx_trylock(vlp1))
  936                                 goto out_relock;
  937                         to_unlock = vlp1;
  938                 }
  939         }
  940         rw_wlock(blp);
  941         cache_zap_locked(ncp, false);
  942         rw_wunlock(blp);
  943         if (to_unlock != NULL)
  944                 mtx_unlock(to_unlock);
  945         return (true);
  946 
  947 out_relock:
  948         mtx_unlock(vlp2);
  949         mtx_lock(vlp1);
  950         mtx_lock(vlp2);
  951         MPASS(*vlpp == NULL);
  952         *vlpp = vlp1;
  953         return (false);
  954 }
  955 
  956 static int
  957 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
  958 {
  959         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
  960         struct rwlock *blp;
  961         int error = 0;
  962 
  963         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
  964         cache_assert_vnode_locked(vp);
  965 
  966         pvlp = VP2VNODELOCK(vp);
  967         if (ncp->nc_flag & NCF_NEGATIVE) {
  968                 cache_zap_negative_locked_vnode_kl(ncp, vp);
  969                 goto out;
  970         }
  971 
  972         blp = NCP2BUCKETLOCK(ncp);
  973         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
  974         vlp2 = VP2VNODELOCK(ncp->nc_vp);
  975         cache_sort(&vlp1, &vlp2);
  976         if (vlp1 == pvlp) {
  977                 mtx_lock(vlp2);
  978                 to_unlock = vlp2;
  979         } else {
  980                 if (!mtx_trylock(vlp1)) {
  981                         error = EAGAIN;
  982                         goto out;
  983                 }
  984                 to_unlock = vlp1;
  985         }
  986         rw_wlock(blp);
  987         cache_zap_locked(ncp, false);
  988         rw_wunlock(blp);
  989         mtx_unlock(to_unlock);
  990 out:
  991         mtx_unlock(pvlp);
  992         return (error);
  993 }
  994 
  995 static int
  996 cache_zap_rlocked_bucket(struct namecache *ncp, struct rwlock *blp)
  997 {
  998         struct mtx *dvlp, *vlp;
  999 
 1000         cache_assert_bucket_locked(ncp, RA_RLOCKED);
 1001 
 1002         dvlp = VP2VNODELOCK(ncp->nc_dvp);
 1003         vlp = NULL;
 1004         if (!(ncp->nc_flag & NCF_NEGATIVE))
 1005                 vlp = VP2VNODELOCK(ncp->nc_vp);
 1006         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
 1007                 rw_runlock(blp);
 1008                 rw_wlock(blp);
 1009                 cache_zap_locked(ncp, false);
 1010                 rw_wunlock(blp);
 1011                 cache_unlock_vnodes(dvlp, vlp);
 1012                 return (0);
 1013         }
 1014 
 1015         rw_runlock(blp);
 1016         return (EAGAIN);
 1017 }
 1018 
 1019 static int
 1020 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
 1021     struct mtx **vlpp1, struct mtx **vlpp2)
 1022 {
 1023         struct mtx *dvlp, *vlp;
 1024 
 1025         cache_assert_bucket_locked(ncp, RA_WLOCKED);
 1026 
 1027         dvlp = VP2VNODELOCK(ncp->nc_dvp);
 1028         vlp = NULL;
 1029         if (!(ncp->nc_flag & NCF_NEGATIVE))
 1030                 vlp = VP2VNODELOCK(ncp->nc_vp);
 1031         cache_sort(&dvlp, &vlp);
 1032 
 1033         if (*vlpp1 == dvlp && *vlpp2 == vlp) {
 1034                 cache_zap_locked(ncp, false);
 1035                 cache_unlock_vnodes(dvlp, vlp);
 1036                 *vlpp1 = NULL;
 1037                 *vlpp2 = NULL;
 1038                 return (0);
 1039         }
 1040 
 1041         if (*vlpp1 != NULL)
 1042                 mtx_unlock(*vlpp1);
 1043         if (*vlpp2 != NULL)
 1044                 mtx_unlock(*vlpp2);
 1045         *vlpp1 = NULL;
 1046         *vlpp2 = NULL;
 1047 
 1048         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
 1049                 cache_zap_locked(ncp, false);
 1050                 cache_unlock_vnodes(dvlp, vlp);
 1051                 return (0);
 1052         }
 1053 
 1054         rw_wunlock(blp);
 1055         *vlpp1 = dvlp;
 1056         *vlpp2 = vlp;
 1057         if (*vlpp1 != NULL)
 1058                 mtx_lock(*vlpp1);
 1059         mtx_lock(*vlpp2);
 1060         rw_wlock(blp);
 1061         return (EAGAIN);
 1062 }
 1063 
 1064 static void
 1065 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
 1066 {
 1067 
 1068         if (blp != NULL) {
 1069                 rw_runlock(blp);
 1070                 mtx_assert(vlp, MA_NOTOWNED);
 1071         } else {
 1072                 mtx_unlock(vlp);
 1073         }
 1074 }
 1075 
 1076 /*
 1077  * Lookup an entry in the cache
 1078  *
 1079  * Lookup is called with dvp pointing to the directory to search,
 1080  * cnp pointing to the name of the entry being sought. If the lookup
 1081  * succeeds, the vnode is returned in *vpp, and a status of -1 is
 1082  * returned. If the lookup determines that the name does not exist
 1083  * (negative caching), a status of ENOENT is returned. If the lookup
 1084  * fails, a status of zero is returned.  If the directory vnode is
 1085  * recycled out from under us due to a forced unmount, a status of
 1086  * ENOENT is returned.
 1087  *
 1088  * vpp is locked and ref'd on return.  If we're looking up DOTDOT, dvp is
 1089  * unlocked.  If we're looking up . an extra ref is taken, but the lock is
 1090  * not recursively acquired.
 1091  */
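
/*
 * A caller typically consumes the return values along these lines (a sketch;
 * compare vfs_cache_lookup(), which wraps this function for filesystems'
 * VOP_LOOKUP implementations):
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	if (error == 0)
 *		...miss: fall back to scanning the directory...
 *	else if (error == -1)
 *		return (0);	positive hit, *vpp locked and referenced
 *	else
 *		return (error);	ENOENT, a cached negative entry
 */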
 1092 
 1093 int
 1094 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
 1095     struct timespec *tsp, int *ticksp)
 1096 {
 1097         struct namecache *ncp;
 1098         struct rwlock *blp;
 1099         struct mtx *dvlp, *dvlp2;
 1100         uint32_t hash;
 1101         int error, ltype;
 1102 
 1103         if (__predict_false(!doingcache)) {
 1104                 cnp->cn_flags &= ~MAKEENTRY;
 1105                 return (0);
 1106         }
 1107 retry:
 1108         blp = NULL;
 1109         dvlp = VP2VNODELOCK(dvp);
 1110         error = 0;
 1111         counter_u64_add(numcalls, 1);
 1112 
 1113         if (cnp->cn_nameptr[0] == '.') {
 1114                 if (cnp->cn_namelen == 1) {
 1115                         *vpp = dvp;
 1116                         CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
 1117                             dvp, cnp->cn_nameptr);
 1118                         counter_u64_add(dothits, 1);
 1119                         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
 1120                         if (tsp != NULL)
 1121                                 timespecclear(tsp);
 1122                         if (ticksp != NULL)
 1123                                 *ticksp = ticks;
 1124                         vrefact(*vpp);
 1125                         /*
 1126                          * When we lookup "." we still can be asked to lock it
 1127                          * differently...
 1128                          */
 1129                         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
 1130                         if (ltype != VOP_ISLOCKED(*vpp)) {
 1131                                 if (ltype == LK_EXCLUSIVE) {
 1132                                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
 1133                                         if ((*vpp)->v_iflag & VI_DOOMED) {
 1134                                                 /* forced unmount */
 1135                                                 vrele(*vpp);
 1136                                                 *vpp = NULL;
 1137                                                 return (ENOENT);
 1138                                         }
 1139                                 } else
 1140                                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
 1141                         }
 1142                         return (-1);
 1143                 }
 1144                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 1145                         counter_u64_add(dotdothits, 1);
 1146                         dvlp2 = NULL;
 1147                         mtx_lock(dvlp);
 1148 retry_dotdot:
 1149                         ncp = dvp->v_cache_dd;
 1150                         if (ncp == NULL) {
 1151                                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
 1152                                     "..", NULL);
 1153                                 mtx_unlock(dvlp);
 1154                                 return (0);
 1155                         }
 1156                         if ((cnp->cn_flags & MAKEENTRY) == 0) {
 1157                                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
 1158                                         if (ncp->nc_dvp != dvp)
 1159                                                 panic("dvp %p v_cache_dd %p\n", dvp, ncp);
 1160                                         if (!cache_zap_locked_vnode_kl2(ncp,
 1161                                             dvp, &dvlp2))
 1162                                                 goto retry_dotdot;
 1163                                         MPASS(dvp->v_cache_dd == NULL);
 1164                                         mtx_unlock(dvlp);
 1165                                         if (dvlp2 != NULL)
 1166                                                 mtx_unlock(dvlp2);
 1167                                         cache_free(ncp);
 1168                                 } else {
 1169                                         dvp->v_cache_dd = NULL;
 1170                                         mtx_unlock(dvlp);
 1171                                         if (dvlp2 != NULL)
 1172                                                 mtx_unlock(dvlp2);
 1173                                 }
 1174                                 return (0);
 1175                         }
 1176                         if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
 1177                                 if (ncp->nc_flag & NCF_NEGATIVE)
 1178                                         *vpp = NULL;
 1179                                 else
 1180                                         *vpp = ncp->nc_vp;
 1181                         } else
 1182                                 *vpp = ncp->nc_dvp;
 1183                         /* Return failure if negative entry was found. */
 1184                         if (*vpp == NULL)
 1185                                 goto negative_success;
 1186                         CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
 1187                             dvp, cnp->cn_nameptr, *vpp);
 1188                         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
 1189                             *vpp);
 1190                         cache_out_ts(ncp, tsp, ticksp);
 1191                         if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
 1192                             NCF_DTS && tsp != NULL)
 1193                                 *tsp = ((struct namecache_ts *)ncp)->
 1194                                     nc_dotdottime;
 1195                         goto success;
 1196                 }
 1197         }
 1198 
 1199         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 1200         blp = HASH2BUCKETLOCK(hash);
 1201         rw_rlock(blp);
 1202 
 1203         LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 1204                 counter_u64_add(numchecks, 1);
 1205                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 1206                     !bcmp(nc_get_name(ncp), cnp->cn_nameptr, ncp->nc_nlen))
 1207                         break;
 1208         }
 1209 
 1210         /* We failed to find an entry */
 1211         if (ncp == NULL) {
 1212                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
 1213                     NULL);
 1214                 if ((cnp->cn_flags & MAKEENTRY) == 0) {
 1215                         counter_u64_add(nummisszap, 1);
 1216                 } else {
 1217                         counter_u64_add(nummiss, 1);
 1218                 }
 1219                 goto unlock;
 1220         }
 1221 
 1222         /* We don't want to have an entry, so dump it */
 1223         if ((cnp->cn_flags & MAKEENTRY) == 0) {
 1224                 counter_u64_add(numposzaps, 1);
 1225                 goto zap_and_exit;
 1226         }
 1227 
 1228         /* We found a "positive" match, return the vnode */
 1229         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 1230                 counter_u64_add(numposhits, 1);
 1231                 *vpp = ncp->nc_vp;
 1232                 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
 1233                     dvp, cnp->cn_nameptr, *vpp, ncp);
 1234                 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp),
 1235                     *vpp);
 1236                 cache_out_ts(ncp, tsp, ticksp);
 1237                 goto success;
 1238         }
 1239 
 1240 negative_success:
 1241         /* We found a negative match, and want to create it, so purge */
 1242         if (cnp->cn_nameiop == CREATE) {
 1243                 counter_u64_add(numnegzaps, 1);
 1244                 goto zap_and_exit;
 1245         }
 1246 
 1247         counter_u64_add(numneghits, 1);
 1248         cache_negative_hit(ncp);
 1249         if (ncp->nc_flag & NCF_WHITE)
 1250                 cnp->cn_flags |= ISWHITEOUT;
 1251         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
 1252             nc_get_name(ncp));
 1253         cache_out_ts(ncp, tsp, ticksp);
 1254         cache_lookup_unlock(blp, dvlp);
 1255         return (ENOENT);
 1256 
 1257 success:
 1258         /*
 1259          * On success we return a locked and ref'd vnode as per the lookup
 1260          * protocol.
 1261          */
 1262         MPASS(dvp != *vpp);
 1263         ltype = 0;      /* silence gcc warning */
 1264         if (cnp->cn_flags & ISDOTDOT) {
 1265                 ltype = VOP_ISLOCKED(dvp);
 1266                 VOP_UNLOCK(dvp, 0);
 1267         }
 1268         vhold(*vpp);
 1269         cache_lookup_unlock(blp, dvlp);
 1270         error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
 1271         if (cnp->cn_flags & ISDOTDOT) {
 1272                 vn_lock(dvp, ltype | LK_RETRY);
 1273                 if (dvp->v_iflag & VI_DOOMED) {
 1274                         if (error == 0)
 1275                                 vput(*vpp);
 1276                         *vpp = NULL;
 1277                         return (ENOENT);
 1278                 }
 1279         }
 1280         if (error) {
 1281                 *vpp = NULL;
 1282                 goto retry;
 1283         }
 1284         if ((cnp->cn_flags & ISLASTCN) &&
 1285             (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
 1286                 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
 1287         }
 1288         return (-1);
 1289 
 1290 unlock:
 1291         cache_lookup_unlock(blp, dvlp);
 1292         return (0);
 1293 
 1294 zap_and_exit:
 1295         if (blp != NULL)
 1296                 error = cache_zap_rlocked_bucket(ncp, blp);
 1297         else
 1298                 error = cache_zap_locked_vnode(ncp, dvp);
 1299         if (error != 0) {
 1300                 zap_and_exit_bucket_fail++;
 1301                 cache_maybe_yield();
 1302                 goto retry;
 1303         }
 1304         cache_free(ncp);
 1305         return (0);
 1306 }
 1307 
 1308 struct celockstate {
 1309         struct mtx *vlp[3];
 1310         struct rwlock *blp[2];
 1311 };
 1312 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
 1313 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
 1314 
 1315 static inline void
 1316 cache_celockstate_init(struct celockstate *cel)
 1317 {
 1318 
 1319         bzero(cel, sizeof(*cel));
 1320 }
 1321 
 1322 static void
 1323 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
 1324     struct vnode *dvp)
 1325 {
 1326         struct mtx *vlp1, *vlp2;
 1327 
 1328         MPASS(cel->vlp[0] == NULL);
 1329         MPASS(cel->vlp[1] == NULL);
 1330         MPASS(cel->vlp[2] == NULL);
 1331 
 1332         MPASS(vp != NULL || dvp != NULL);
 1333 
 1334         vlp1 = VP2VNODELOCK(vp);
 1335         vlp2 = VP2VNODELOCK(dvp);
 1336         cache_sort(&vlp1, &vlp2);
 1337 
 1338         if (vlp1 != NULL) {
 1339                 mtx_lock(vlp1);
 1340                 cel->vlp[0] = vlp1;
 1341         }
 1342         mtx_lock(vlp2);
 1343         cel->vlp[1] = vlp2;
 1344 }
 1345 
 1346 static void
 1347 cache_unlock_vnodes_cel(struct celockstate *cel)
 1348 {
 1349 
 1350         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
 1351 
 1352         if (cel->vlp[0] != NULL)
 1353                 mtx_unlock(cel->vlp[0]);
 1354         if (cel->vlp[1] != NULL)
 1355                 mtx_unlock(cel->vlp[1]);
 1356         if (cel->vlp[2] != NULL)
 1357                 mtx_unlock(cel->vlp[2]);
 1358 }
 1359 
 1360 static bool
 1361 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
 1362 {
 1363         struct mtx *vlp;
 1364         bool ret;
 1365 
 1366         cache_assert_vlp_locked(cel->vlp[0]);
 1367         cache_assert_vlp_locked(cel->vlp[1]);
 1368         MPASS(cel->vlp[2] == NULL);
 1369 
 1370         MPASS(vp != NULL);
 1371         vlp = VP2VNODELOCK(vp);
 1372 
 1373         ret = true;
 1374         if (vlp >= cel->vlp[1]) {
 1375                 mtx_lock(vlp);
 1376         } else {
 1377                 if (mtx_trylock(vlp))
 1378                         goto out;
 1379                 cache_lock_vnodes_cel_3_failures++;
 1380                 cache_unlock_vnodes_cel(cel);
 1381                 if (vlp < cel->vlp[0]) {
 1382                         mtx_lock(vlp);
 1383                         mtx_lock(cel->vlp[0]);
 1384                         mtx_lock(cel->vlp[1]);
 1385                 } else {
 1386                         if (cel->vlp[0] != NULL)
 1387                                 mtx_lock(cel->vlp[0]);
 1388                         mtx_lock(vlp);
 1389                         mtx_lock(cel->vlp[1]);
 1390                 }
 1391                 ret = false;
 1392         }
 1393 out:
 1394         cel->vlp[2] = vlp;
 1395         return (ret);
 1396 }
 1397 
 1398 static void
 1399 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
 1400     struct rwlock *blp2)
 1401 {
 1402 
 1403         MPASS(cel->blp[0] == NULL);
 1404         MPASS(cel->blp[1] == NULL);
 1405 
 1406         cache_sort(&blp1, &blp2);
 1407 
 1408         if (blp1 != NULL) {
 1409                 rw_wlock(blp1);
 1410                 cel->blp[0] = blp1;
 1411         }
 1412         rw_wlock(blp2);
 1413         cel->blp[1] = blp2;
 1414 }
 1415 
 1416 static void
 1417 cache_unlock_buckets_cel(struct celockstate *cel)
 1418 {
 1419 
 1420         if (cel->blp[0] != NULL)
 1421                 rw_wunlock(cel->blp[0]);
 1422         rw_wunlock(cel->blp[1]);
 1423 }
 1424 
 1425 /*
 1426  * Lock part of the cache affected by the insertion.
 1427  *
 1428  * This means vnodelocks for dvp, vp and the relevant bucketlock.
 1429  * However, insertion can result in removal of an old entry. In this
 1430  * case we have an additional vnode and bucketlock pair to lock. If the
 1431  * old entry is negative, it has no vnode to lock, so only the extra bucketlock is taken.
 1432  *
 1433  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 1434  * preserving the locking order (smaller address first).
 1435  */
 1436 static void
 1437 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
 1438     uint32_t hash)
 1439 {
 1440         struct namecache *ncp;
 1441         struct rwlock *blps[2];
 1442 
 1443         blps[0] = HASH2BUCKETLOCK(hash);
 1444         for (;;) {
 1445                 blps[1] = NULL;
 1446                 cache_lock_vnodes_cel(cel, dvp, vp);
 1447                 if (vp == NULL || vp->v_type != VDIR)
 1448                         break;
 1449                 ncp = vp->v_cache_dd;
 1450                 if (ncp == NULL)
 1451                         break;
 1452                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 1453                         break;
 1454                 MPASS(ncp->nc_dvp == vp);
 1455                 blps[1] = NCP2BUCKETLOCK(ncp);
 1456                 if (ncp->nc_flag & NCF_NEGATIVE)
 1457                         break;
 1458                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
 1459                         break;
 1460                 /*
 1461                  * All vnodes got re-locked. Re-validate the state and if
 1462                  * nothing changed we are done. Otherwise restart.
 1463                  */
 1464                 if (ncp == vp->v_cache_dd &&
 1465                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
 1466                     blps[1] == NCP2BUCKETLOCK(ncp) &&
 1467                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
 1468                         break;
 1469                 cache_unlock_vnodes_cel(cel);
 1470                 cel->vlp[0] = NULL;
 1471                 cel->vlp[1] = NULL;
 1472                 cel->vlp[2] = NULL;
 1473         }
 1474         cache_lock_buckets_cel(cel, blps[0], blps[1]);
 1475 }
 1476 
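/*
 * As above, but for a ".." insertion: the entry that may have to be
 * evicted is dvp->v_cache_dd rather than vp->v_cache_dd.
 */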
 1477 static void
 1478 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
 1479     uint32_t hash)
 1480 {
 1481         struct namecache *ncp;
 1482         struct rwlock *blps[2];
 1483 
 1484         blps[0] = HASH2BUCKETLOCK(hash);
 1485         for (;;) {
 1486                 blps[1] = NULL;
 1487                 cache_lock_vnodes_cel(cel, dvp, vp);
 1488                 ncp = dvp->v_cache_dd;
 1489                 if (ncp == NULL)
 1490                         break;
 1491                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 1492                         break;
 1493                 MPASS(ncp->nc_dvp == dvp);
 1494                 blps[1] = NCP2BUCKETLOCK(ncp);
 1495                 if (ncp->nc_flag & NCF_NEGATIVE)
 1496                         break;
 1497                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
 1498                         break;
 1499                 if (ncp == dvp->v_cache_dd &&
 1500                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
 1501                     blps[1] == NCP2BUCKETLOCK(ncp) &&
 1502                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
 1503                         break;
 1504                 cache_unlock_vnodes_cel(cel);
 1505                 cel->vlp[0] = NULL;
 1506                 cel->vlp[1] = NULL;
 1507                 cel->vlp[2] = NULL;
 1508         }
 1509         cache_lock_buckets_cel(cel, blps[0], blps[1]);
 1510 }
 1511 
 1512 static void
 1513 cache_enter_unlock(struct celockstate *cel)
 1514 {
 1515 
 1516         cache_unlock_buckets_cel(cel);
 1517         cache_unlock_vnodes_cel(cel);
 1518 }
 1519 
 1520 /*
 1521  * Add an entry to the cache.
 1522  */
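/*
 * Illustrative use only: a filesystem's lookup routine typically calls
 * cache_enter(dvp, *vpp, cnp) after resolving a component, or
 * cache_enter(dvp, NULL, cnp) to record a negative entry, and usually
 * only when the caller requested caching by setting MAKEENTRY in
 * cnp->cn_flags.
 */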
 1523 void
 1524 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
 1525     struct timespec *tsp, struct timespec *dtsp)
 1526 {
 1527         struct celockstate cel;
 1528         struct namecache *ncp, *n2, *ndd;
 1529         struct namecache_ts *n3;
 1530         struct nchashhead *ncpp;
 1531         struct neglist *neglist;
 1532         uint32_t hash;
 1533         int flag;
 1534         int len;
 1535         bool neg_locked;
 1536 
 1537         CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
 1538         VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
 1539             ("cache_enter: Adding a doomed vnode"));
 1540         VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
 1541             ("cache_enter: Doomed vnode used as src"));
 1542 
 1543         if (__predict_false(!doingcache))
 1544                 return;
 1545 
 1546         /*
 1547          * Avoid blowout in namecache entries.
 1548          */
 1549         if (__predict_false(numcache >= desiredvnodes * ncsizefactor))
 1550                 return;
 1551 
 1552         cache_celockstate_init(&cel);
 1553         ndd = NULL;
 1554         flag = 0;
 1555         if (cnp->cn_nameptr[0] == '.') {
 1556                 if (cnp->cn_namelen == 1)
 1557                         return;
 1558                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 1559                         len = cnp->cn_namelen;
 1560                         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 1561                         cache_enter_lock_dd(&cel, dvp, vp, hash);
 1562                         /*
 1563                          * If a ".." entry already exists, just retarget it
 1564                          * to the new parent vnode; otherwise continue with
 1565                          * allocation of a new namecache entry.
 1566                          */
 1567                         if ((ncp = dvp->v_cache_dd) != NULL &&
 1568                             ncp->nc_flag & NCF_ISDOTDOT) {
 1569                                 KASSERT(ncp->nc_dvp == dvp,
 1570                                     ("wrong isdotdot parent"));
 1571                                 neg_locked = false;
 1572                                 if (ncp->nc_flag & NCF_NEGATIVE || vp == NULL) {
 1573                                         neglist = NCP2NEGLIST(ncp);
 1574                                         mtx_lock(&ncneg_hot.nl_lock);
 1575                                         mtx_lock(&neglist->nl_lock);
 1576                                         neg_locked = true;
 1577                                 }
 1578                                 if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 1579                                         TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
 1580                                             ncp, nc_dst);
 1581                                 } else {
 1582                                         cache_negative_remove(ncp, true);
 1583                                 }
 1584                                 if (vp != NULL) {
 1585                                         TAILQ_INSERT_HEAD(&vp->v_cache_dst,
 1586                                             ncp, nc_dst);
 1587                                         ncp->nc_flag &= ~(NCF_NEGATIVE|NCF_HOTNEGATIVE);
 1588                                 } else {
 1589                                         ncp->nc_flag &= ~(NCF_HOTNEGATIVE);
 1590                                         ncp->nc_flag |= NCF_NEGATIVE;
 1591                                         cache_negative_insert(ncp, true);
 1592                                 }
 1593                                 if (neg_locked) {
 1594                                         mtx_unlock(&neglist->nl_lock);
 1595                                         mtx_unlock(&ncneg_hot.nl_lock);
 1596                                 }
 1597                                 ncp->nc_vp = vp;
 1598                                 cache_enter_unlock(&cel);
 1599                                 return;
 1600                         }
 1601                         dvp->v_cache_dd = NULL;
 1602                         cache_enter_unlock(&cel);
 1603                         cache_celockstate_init(&cel);
 1604                         SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp);
 1605                         flag = NCF_ISDOTDOT;
 1606                 }
 1607         }
 1608 
 1609         /*
 1610          * Calculate the hash key and setup as much of the new
 1611          * namecache entry as possible before acquiring the lock.
 1612          */
 1613         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
 1614         ncp->nc_flag = flag;
 1615         ncp->nc_vp = vp;
 1616         if (vp == NULL)
 1617                 ncp->nc_flag |= NCF_NEGATIVE;
 1618         ncp->nc_dvp = dvp;
 1619         if (tsp != NULL) {
 1620                 n3 = (struct namecache_ts *)ncp;
 1621                 n3->nc_time = *tsp;
 1622                 n3->nc_ticks = ticks;
 1623                 n3->nc_flag |= NCF_TS;
 1624                 if (dtsp != NULL) {
 1625                         n3->nc_dotdottime = *dtsp;
 1626                         n3->nc_flag |= NCF_DTS;
 1627                 }
 1628         }
 1629         len = ncp->nc_nlen = cnp->cn_namelen;
 1630         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 1631         strlcpy(nc_get_name(ncp), cnp->cn_nameptr, len + 1);
 1632         cache_enter_lock(&cel, dvp, vp, hash);
 1633 
 1634         /*
 1635          * See if this vnode or negative entry is already in the cache
 1636          * with this name.  This can happen with concurrent lookups of
 1637          * the same path name.
 1638          */
 1639         ncpp = NCHHASH(hash);
 1640         LIST_FOREACH(n2, ncpp, nc_hash) {
 1641                 if (n2->nc_dvp == dvp &&
 1642                     n2->nc_nlen == cnp->cn_namelen &&
 1643                     !bcmp(nc_get_name(n2), cnp->cn_nameptr, n2->nc_nlen)) {
 1644                         if (tsp != NULL) {
 1645                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
 1646                                     ("no NCF_TS"));
 1647                                 n3 = (struct namecache_ts *)n2;
 1648                                 n3->nc_time =
 1649                                     ((struct namecache_ts *)ncp)->nc_time;
 1650                                 n3->nc_ticks =
 1651                                     ((struct namecache_ts *)ncp)->nc_ticks;
 1652                                 if (dtsp != NULL) {
 1653                                         n3->nc_dotdottime =
 1654                                             ((struct namecache_ts *)ncp)->
 1655                                             nc_dotdottime;
 1656                                         if (ncp->nc_flag & NCF_NEGATIVE)
 1657                                                 mtx_lock(&ncneg_hot.nl_lock);
 1658                                         n3->nc_flag |= NCF_DTS;
 1659                                         if (ncp->nc_flag & NCF_NEGATIVE)
 1660                                                 mtx_unlock(&ncneg_hot.nl_lock);
 1661                                 }
 1662                         }
 1663                         goto out_unlock_free;
 1664                 }
 1665         }
 1666 
 1667         if (flag == NCF_ISDOTDOT) {
 1668                 /*
 1669                  * See if we are trying to add a ".." entry while some other lookup
 1670                  * has already populated the v_cache_dd pointer.
 1671                  */
 1672                 if (dvp->v_cache_dd != NULL)
 1673                         goto out_unlock_free;
 1674                 KASSERT(vp == NULL || vp->v_type == VDIR,
 1675                     ("wrong vnode type %p", vp));
 1676                 dvp->v_cache_dd = ncp;
 1677         }
 1678 
 1679         atomic_add_rel_long(&numcache, 1);
 1680         if (vp != NULL) {
 1681                 if (vp->v_type == VDIR) {
 1682                         if (flag != NCF_ISDOTDOT) {
 1683                                 /*
 1684                                  * In this case, the cache entry maps both the name
 1685                                  * of the directory and the name ".." referring to
 1686                                  * the directory's parent.
 1687                                  */
 1688                                 if ((ndd = vp->v_cache_dd) != NULL) {
 1689                                         if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
 1690                                                 cache_zap_locked(ndd, false);
 1691                                         else
 1692                                                 ndd = NULL;
 1693                                 }
 1694                                 vp->v_cache_dd = ncp;
 1695                         }
 1696                 } else {
 1697                         vp->v_cache_dd = NULL;
 1698                 }
 1699         }
 1700 
 1701         if (flag != NCF_ISDOTDOT) {
 1702                 if (LIST_EMPTY(&dvp->v_cache_src)) {
 1703                         vhold(dvp);
 1704                         atomic_add_rel_long(&numcachehv, 1);
 1705                 }
 1706                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 1707         }
 1708 
 1709         /*
 1710          * Insert the new namecache entry into the appropriate chain
 1711          * within the cache entries table.
 1712          */
 1713         LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 1714 
 1715         /*
 1716          * If the entry is "negative", we place it into the
 1717          * "negative" cache queue, otherwise, we place it into the
 1718          * destination vnode's cache entries queue.
 1719          */
 1720         if (vp != NULL) {
 1721                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
 1722                 SDT_PROBE3(vfs, namecache, enter, done, dvp, nc_get_name(ncp),
 1723                     vp);
 1724         } else {
 1725                 if (cnp->cn_flags & ISWHITEOUT)
 1726                         ncp->nc_flag |= NCF_WHITE;
 1727                 cache_negative_insert(ncp, false);
 1728                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
 1729                     nc_get_name(ncp));
 1730         }
 1731         cache_enter_unlock(&cel);
 1732         if (numneg * ncnegfactor > numcache)
 1733                 cache_negative_zap_one();
 1734         cache_free(ndd);
 1735         return;
 1736 out_unlock_free:
 1737         cache_enter_unlock(&cel);
 1738         cache_free(ncp);
 1739         return;
 1740 }
 1741 
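/*
 * Round up to a power of two: returns the smallest power of two strictly
 * greater than val, e.g. cache_roundup_2(63) == 64 and
 * cache_roundup_2(64) == 128.  Callers below subtract 1 to form a mask.
 */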
 1742 static u_int
 1743 cache_roundup_2(u_int val)
 1744 {
 1745         u_int res;
 1746 
 1747         for (res = 1; res <= val; res <<= 1)
 1748                 continue;
 1749 
 1750         return (res);
 1751 }
 1752 
 1753 /*
 1754  * Name cache initialization, from vfs_init() when we are booting
 1755  */
 1756 static void
 1757 nchinit(void *dummy __unused)
 1758 {
 1759         u_int i;
 1760 
 1761         cache_zone_small = uma_zcreate("S VFS Cache",
 1762             sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
 1763             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 1764         cache_zone_small_ts = uma_zcreate("STS VFS Cache",
 1765             sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
 1766             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 1767         cache_zone_large = uma_zcreate("L VFS Cache",
 1768             sizeof(struct namecache) + NAME_MAX + 1,
 1769             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 1770         cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
 1771             sizeof(struct namecache_ts) + NAME_MAX + 1,
 1772             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 1773 
 1774         nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
 1775         ncbuckethash = cache_roundup_2(mp_ncpus * 64) - 1;
 1776         if (ncbuckethash > nchash)
 1777                 ncbuckethash = nchash;
 1778         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
 1779             M_WAITOK | M_ZERO);
 1780         for (i = 0; i < numbucketlocks; i++)
 1781                 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
 1782         ncvnodehash = cache_roundup_2(mp_ncpus * 64) - 1;
 1783         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
 1784             M_WAITOK | M_ZERO);
 1785         for (i = 0; i < numvnodelocks; i++)
 1786                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
 1787         ncpurgeminvnodes = numbucketlocks;
 1788 
 1789         ncneghash = 3;
 1790         neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
 1791             M_WAITOK | M_ZERO);
 1792         for (i = 0; i < numneglists; i++) {
 1793                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
 1794                 TAILQ_INIT(&neglists[i].nl_list);
 1795         }
 1796         mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
 1797         TAILQ_INIT(&ncneg_hot.nl_list);
 1798 
 1799         mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
 1800 
 1801         numcalls = counter_u64_alloc(M_WAITOK);
 1802         dothits = counter_u64_alloc(M_WAITOK);
 1803         dotdothits = counter_u64_alloc(M_WAITOK);
 1804         numchecks = counter_u64_alloc(M_WAITOK);
 1805         nummiss = counter_u64_alloc(M_WAITOK);
 1806         nummisszap = counter_u64_alloc(M_WAITOK);
 1807         numposzaps = counter_u64_alloc(M_WAITOK);
 1808         numposhits = counter_u64_alloc(M_WAITOK);
 1809         numnegzaps = counter_u64_alloc(M_WAITOK);
 1810         numneghits = counter_u64_alloc(M_WAITOK);
 1811         numfullpathcalls = counter_u64_alloc(M_WAITOK);
 1812         numfullpathfail1 = counter_u64_alloc(M_WAITOK);
 1813         numfullpathfail2 = counter_u64_alloc(M_WAITOK);
 1814         numfullpathfail4 = counter_u64_alloc(M_WAITOK);
 1815         numfullpathfound = counter_u64_alloc(M_WAITOK);
 1816 }
 1817 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
 1818 
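/*
 * Resize the name cache hash table (e.g. after the vnode limit is
 * changed) and rehash all existing entries into the new table.
 */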
 1819 void
 1820 cache_changesize(int newmaxvnodes)
 1821 {
 1822         struct nchashhead *new_nchashtbl, *old_nchashtbl;
 1823         u_long new_nchash, old_nchash;
 1824         struct namecache *ncp;
 1825         uint32_t hash;
 1826         int i;
 1827 
 1828         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
 1829         if (newmaxvnodes < numbucketlocks)
 1830                 newmaxvnodes = numbucketlocks;
 1831 
 1832         new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
 1833         /* If same hash table size, nothing to do */
 1834         if (nchash == new_nchash) {
 1835                 free(new_nchashtbl, M_VFSCACHE);
 1836                 return;
 1837         }
 1838         /*
 1839          * Move everything from the old hash table to the new table.
 1840          * While all the vnode and bucket locks are held, no namecache entry
 1841          * can go away, since removing one requires taking it off the hash table.
 1842          */
 1843         cache_lock_all_vnodes();
 1844         cache_lock_all_buckets();
 1845         old_nchashtbl = nchashtbl;
 1846         old_nchash = nchash;
 1847         nchashtbl = new_nchashtbl;
 1848         nchash = new_nchash;
 1849         for (i = 0; i <= old_nchash; i++) {
 1850                 while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
 1851                         hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen,
 1852                             ncp->nc_dvp);
 1853                         LIST_REMOVE(ncp, nc_hash);
 1854                         LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
 1855                 }
 1856         }
 1857         cache_unlock_all_buckets();
 1858         cache_unlock_all_vnodes();
 1859         free(old_nchashtbl, M_VFSCACHE);
 1860 }
 1861 
 1862 /*
 1863  * Invalidate all entries to a particular vnode.
 1864  */
 1865 void
 1866 cache_purge(struct vnode *vp)
 1867 {
 1868         TAILQ_HEAD(, namecache) ncps;
 1869         struct namecache *ncp, *nnp;
 1870         struct mtx *vlp, *vlp2;
 1871 
 1872         CTR1(KTR_VFS, "cache_purge(%p)", vp);
 1873         SDT_PROBE1(vfs, namecache, purge, done, vp);
 1874         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
 1875             vp->v_cache_dd == NULL)
 1876                 return;
 1877         TAILQ_INIT(&ncps);
 1878         vlp = VP2VNODELOCK(vp);
 1879         vlp2 = NULL;
 1880         mtx_lock(vlp);
 1881 retry:
 1882         while (!LIST_EMPTY(&vp->v_cache_src)) {
 1883                 ncp = LIST_FIRST(&vp->v_cache_src);
 1884                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 1885                         goto retry;
 1886                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 1887         }
 1888         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
 1889                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
 1890                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 1891                         goto retry;
 1892                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 1893         }
 1894         ncp = vp->v_cache_dd;
 1895         if (ncp != NULL) {
 1896                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
 1897                    ("lost dotdot link"));
 1898                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 1899                         goto retry;
 1900                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 1901         }
 1902         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
 1903         mtx_unlock(vlp);
 1904         if (vlp2 != NULL)
 1905                 mtx_unlock(vlp2);
 1906         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 1907                 cache_free(ncp);
 1908         }
 1909 }
 1910 
 1911 /*
 1912  * Invalidate all negative entries for a particular directory vnode.
 1913  */
 1914 void
 1915 cache_purge_negative(struct vnode *vp)
 1916 {
 1917         TAILQ_HEAD(, namecache) ncps;
 1918         struct namecache *ncp, *nnp;
 1919         struct mtx *vlp;
 1920 
 1921         CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
 1922         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
 1923         TAILQ_INIT(&ncps);
 1924         vlp = VP2VNODELOCK(vp);
 1925         mtx_lock(vlp);
 1926         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
 1927                 if (!(ncp->nc_flag & NCF_NEGATIVE))
 1928                         continue;
 1929                 cache_zap_negative_locked_vnode_kl(ncp, vp);
 1930                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 1931         }
 1932         mtx_unlock(vlp);
 1933         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 1934                 cache_free(ncp);
 1935         }
 1936 }
 1937 
 1938 /*
 1939  * Flush all entries referencing a particular filesystem.
 1940  */
 1941 void
 1942 cache_purgevfs(struct mount *mp, bool force)
 1943 {
 1944         TAILQ_HEAD(, namecache) ncps;
 1945         struct mtx *vlp1, *vlp2;
 1946         struct rwlock *blp;
 1947         struct nchashhead *bucket;
 1948         struct namecache *ncp, *nnp;
 1949         u_long i, j, n_nchash;
 1950         int error;
 1951 
 1952         /* Scan hash tables for applicable entries */
 1953         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
 1954         if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
 1955                 return;
 1956         TAILQ_INIT(&ncps);
 1957         n_nchash = nchash + 1;
 1958         vlp1 = vlp2 = NULL;
 1959         for (i = 0; i < numbucketlocks; i++) {
 1960                 blp = (struct rwlock *)&bucketlocks[i];
 1961                 rw_wlock(blp);
 1962                 for (j = i; j < n_nchash; j += numbucketlocks) {
 1963 retry:
 1964                         bucket = &nchashtbl[j];
 1965                         LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
 1966                                 cache_assert_bucket_locked(ncp, RA_WLOCKED);
 1967                                 if (ncp->nc_dvp->v_mount != mp)
 1968                                         continue;
 1969                                 error = cache_zap_wlocked_bucket_kl(ncp, blp,
 1970                                     &vlp1, &vlp2);
 1971                                 if (error != 0)
 1972                                         goto retry;
 1973                                 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
 1974                         }
 1975                 }
 1976                 rw_wunlock(blp);
 1977                 if (vlp1 == NULL && vlp2 == NULL)
 1978                         cache_maybe_yield();
 1979         }
 1980         if (vlp1 != NULL)
 1981                 mtx_unlock(vlp1);
 1982         if (vlp2 != NULL)
 1983                 mtx_unlock(vlp2);
 1984 
 1985         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 1986                 cache_free(ncp);
 1987         }
 1988 }
 1989 
 1990 /*
 1991  * Perform canonical checks and a cache lookup, and pass the lookup on to the
 1992  * filesystem through vop_cachedlookup only if needed.
 1993  */
 1994 
 1995 int
 1996 vfs_cache_lookup(struct vop_lookup_args *ap)
 1997 {
 1998         struct vnode *dvp;
 1999         int error;
 2000         struct vnode **vpp = ap->a_vpp;
 2001         struct componentname *cnp = ap->a_cnp;
 2002         struct ucred *cred = cnp->cn_cred;
 2003         int flags = cnp->cn_flags;
 2004         struct thread *td = cnp->cn_thread;
 2005 
 2006         *vpp = NULL;
 2007         dvp = ap->a_dvp;
 2008 
 2009         if (dvp->v_type != VDIR)
 2010                 return (ENOTDIR);
 2011 
 2012         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 2013             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 2014                 return (EROFS);
 2015 
 2016         error = VOP_ACCESS(dvp, VEXEC, cred, td);
 2017         if (error)
 2018                 return (error);
 2019 
 2020         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 2021         if (error == 0)
 2022                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 2023         if (error == -1)
 2024                 return (0);
 2025         return (error);
 2026 }
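/*
 * Illustrative wiring (modeled on how filesystems such as UFS use this;
 * "foofs" is a made-up name): vfs_cache_lookup is installed as the
 * vop_lookup operation and the filesystem's real lookup routine becomes
 * vop_cachedlookup, so it only runs on a cache miss.
 *
 *	struct vop_vector foofs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= foofs_lookup,
 *	};
 */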
 2027 
 2028 /*
 2029  * XXX All of these sysctls would probably be more productive dead.
 2030  */
 2031 static int __read_mostly disablecwd;
 2032 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
 2033    "Disable the getcwd syscall");
 2034 
 2035 /* Implementation of the getcwd syscall. */
 2036 int
 2037 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
 2038 {
 2039 
 2040         return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
 2041             MAXPATHLEN));
 2042 }
 2043 
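/*
 * Copy the path of the current working directory (relative to the
 * process root directory) into buf; bufseg says whether buf is a kernel
 * or a user address.
 */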
 2044 int
 2045 kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, u_int buflen,
 2046     u_int path_max)
 2047 {
 2048         char *bp, *tmpbuf;
 2049         struct filedesc *fdp;
 2050         struct vnode *cdir, *rdir;
 2051         int error;
 2052 
 2053         if (__predict_false(disablecwd))
 2054                 return (ENODEV);
 2055         if (__predict_false(buflen < 2))
 2056                 return (EINVAL);
 2057         if (buflen > path_max)
 2058                 buflen = path_max;
 2059 
 2060         tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
 2061         fdp = td->td_proc->p_fd;
 2062         FILEDESC_SLOCK(fdp);
 2063         cdir = fdp->fd_cdir;
 2064         vrefact(cdir);
 2065         rdir = fdp->fd_rdir;
 2066         vrefact(rdir);
 2067         FILEDESC_SUNLOCK(fdp);
 2068         error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
 2069         vrele(rdir);
 2070         vrele(cdir);
 2071 
 2072         if (!error) {
 2073                 if (bufseg == UIO_SYSSPACE)
 2074                         bcopy(bp, buf, strlen(bp) + 1);
 2075                 else
 2076                         error = copyout(bp, buf, strlen(bp) + 1);
 2077 #ifdef KTRACE
 2078                 if (KTRPOINT(curthread, KTR_NAMEI))
 2079                         ktrnamei(bp);
 2080 #endif
 2081         }
 2082         free(tmpbuf, M_TEMP);
 2083         return (error);
 2084 }
 2085 
 2086 /*
 2087  * Thus begins the fullpath magic.
 2088  */
 2089 
 2090 static int __read_mostly disablefullpath;
 2091 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
 2092     "Disable the vn_fullpath function");
 2093 
 2094 /*
 2095  * Retrieve the full filesystem path that corresponds to a vnode from the name
 2096  * cache (if available).
 2097  */
 2098 int
 2099 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
 2100 {
 2101         char *buf;
 2102         struct filedesc *fdp;
 2103         struct vnode *rdir;
 2104         int error;
 2105 
 2106         if (__predict_false(disablefullpath))
 2107                 return (ENODEV);
 2108         if (__predict_false(vn == NULL))
 2109                 return (EINVAL);
 2110 
 2111         buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 2112         fdp = td->td_proc->p_fd;
 2113         FILEDESC_SLOCK(fdp);
 2114         rdir = fdp->fd_rdir;
 2115         vrefact(rdir);
 2116         FILEDESC_SUNLOCK(fdp);
 2117         error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
 2118         vrele(rdir);
 2119 
 2120         if (!error)
 2121                 *freebuf = buf;
 2122         else
 2123                 free(buf, M_TEMP);
 2124         return (error);
 2125 }
 2126 
 2127 /*
 2128  * This function is similar to vn_fullpath, but it attempts to lookup the
 2129  * pathname relative to the global root mount point.  This is required for the
 2130  * auditing sub-system, as audited pathnames must be absolute, relative to the
 2131  * global root mount point.
 2132  */
 2133 int
 2134 vn_fullpath_global(struct thread *td, struct vnode *vn,
 2135     char **retbuf, char **freebuf)
 2136 {
 2137         char *buf;
 2138         int error;
 2139 
 2140         if (__predict_false(disablefullpath))
 2141                 return (ENODEV);
 2142         if (__predict_false(vn == NULL))
 2143                 return (EINVAL);
 2144         buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 2145         error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
 2146         if (!error)
 2147                 *freebuf = buf;
 2148         else
 2149                 free(buf, M_TEMP);
 2150         return (error);
 2151 }
 2152 
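/*
 * Resolve one pathname component: copy the name of *vp into the end of
 * buf (adjusting *buflen) and replace *vp with a referenced vnode for
 * its parent directory, consulting the namecache first and falling back
 * to VOP_VPTOCNP() on a miss.  The caller's reference on the original
 * *vp is released.
 */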
 2153 int
 2154 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
 2155 {
 2156         struct vnode *dvp;
 2157         struct namecache *ncp;
 2158         struct mtx *vlp;
 2159         int error;
 2160 
 2161         vlp = VP2VNODELOCK(*vp);
 2162         mtx_lock(vlp);
 2163         TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
 2164                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 2165                         break;
 2166         }
 2167         if (ncp != NULL) {
 2168                 if (*buflen < ncp->nc_nlen) {
 2169                         mtx_unlock(vlp);
 2170                         vrele(*vp);
 2171                         counter_u64_add(numfullpathfail4, 1);
 2172                         error = ENOMEM;
 2173                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
 2174                             vp, NULL);
 2175                         return (error);
 2176                 }
 2177                 *buflen -= ncp->nc_nlen;
 2178                 memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen);
 2179                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
 2180                     nc_get_name(ncp), vp);
 2181                 dvp = *vp;
 2182                 *vp = ncp->nc_dvp;
 2183                 vref(*vp);
 2184                 mtx_unlock(vlp);
 2185                 vrele(dvp);
 2186                 return (0);
 2187         }
 2188         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
 2189 
 2190         mtx_unlock(vlp);
 2191         vn_lock(*vp, LK_SHARED | LK_RETRY);
 2192         error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
 2193         vput(*vp);
 2194         if (error) {
 2195                 counter_u64_add(numfullpathfail2, 1);
 2196                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
 2197                 return (error);
 2198         }
 2199 
 2200         *vp = dvp;
 2201         if (dvp->v_iflag & VI_DOOMED) {
 2202                 /* forced unmount */
 2203                 vrele(dvp);
 2204                 error = ENOENT;
 2205                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
 2206                 return (error);
 2207         }
 2208         /*
 2209          * *vp still holds the use count obtained above.
 2210          */
 2211 
 2212         return (0);
 2213 }
 2214 
 2215 /*
 2216  * The magic behind kern___getcwd() and vn_fullpath().
 2217  */
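/*
 * The path is assembled backwards, from the end of buf towards its
 * beginning: vn_vptocnp() prepends one component at a time while we
 * walk up towards rdir, and mount points are crossed by stepping from
 * a VV_ROOT vnode to the vnode covered by its mount.
 */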
 2218 static int
 2219 vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
 2220     char *buf, char **retbuf, u_int buflen)
 2221 {
 2222         int error, slash_prefixed;
 2223 #ifdef KDTRACE_HOOKS
 2224         struct vnode *startvp = vp;
 2225 #endif
 2226         struct vnode *vp1;
 2227 
 2228         buflen--;
 2229         buf[buflen] = '\0';
 2230         error = 0;
 2231         slash_prefixed = 0;
 2232 
 2233         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
 2234         counter_u64_add(numfullpathcalls, 1);
 2235         vref(vp);
 2236         if (vp->v_type != VDIR) {
 2237                 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 2238                 if (error)
 2239                         return (error);
 2240                 if (buflen == 0) {
 2241                         vrele(vp);
 2242                         return (ENOMEM);
 2243                 }
 2244                 buf[--buflen] = '/';
 2245                 slash_prefixed = 1;
 2246         }
 2247         while (vp != rdir && vp != rootvnode) {
 2248                 /*
 2249                  * The vp vnode must be already fully constructed,
 2250                  * since it is either found in namecache or obtained
 2251                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
 2252                  * without obtaining the vnode lock.
 2253                  */
 2254                 if ((vp->v_vflag & VV_ROOT) != 0) {
 2255                         vn_lock(vp, LK_RETRY | LK_SHARED);
 2256 
 2257                         /*
 2258                          * With the vnode locked, check for races with
 2259                          * unmount, forced or not.  Note that we
 2260                          * already verified that vp is not equal to
 2261                          * the root vnode, which means that
 2262                          * mnt_vnodecovered can be NULL only for the
 2263                          * case of unmount.
 2264                          */
 2265                         if ((vp->v_iflag & VI_DOOMED) != 0 ||
 2266                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
 2267                             vp1->v_mountedhere != vp->v_mount) {
 2268                                 vput(vp);
 2269                                 error = ENOENT;
 2270                                 SDT_PROBE3(vfs, namecache, fullpath, return,
 2271                                     error, vp, NULL);
 2272                                 break;
 2273                         }
 2274 
 2275                         vref(vp1);
 2276                         vput(vp);
 2277                         vp = vp1;
 2278                         continue;
 2279                 }
 2280                 if (vp->v_type != VDIR) {
 2281                         vrele(vp);
 2282                         counter_u64_add(numfullpathfail1, 1);
 2283                         error = ENOTDIR;
 2284                         SDT_PROBE3(vfs, namecache, fullpath, return,
 2285                             error, vp, NULL);
 2286                         break;
 2287                 }
 2288                 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 2289                 if (error)
 2290                         break;
 2291                 if (buflen == 0) {
 2292                         vrele(vp);
 2293                         error = ENOMEM;
 2294                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
 2295                             startvp, NULL);
 2296                         break;
 2297                 }
 2298                 buf[--buflen] = '/';
 2299                 slash_prefixed = 1;
 2300         }
 2301         if (error)
 2302                 return (error);
 2303         if (!slash_prefixed) {
 2304                 if (buflen == 0) {
 2305                         vrele(vp);
 2306                         counter_u64_add(numfullpathfail4, 1);
 2307                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
 2308                             startvp, NULL);
 2309                         return (ENOMEM);
 2310                 }
 2311                 buf[--buflen] = '/';
 2312         }
 2313         counter_u64_add(numfullpathfound, 1);
 2314         vrele(vp);
 2315 
 2316         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
 2317         *retbuf = buf + buflen;
 2318         return (0);
 2319 }
 2320 
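/*
 * Return a referenced, shared-locked vnode for the parent directory of
 * vp, found via a non-".." namecache entry naming vp, or NULL if no such
 * entry exists or the parent could not be locked without sleeping.
 */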
 2321 struct vnode *
 2322 vn_dir_dd_ino(struct vnode *vp)
 2323 {
 2324         struct namecache *ncp;
 2325         struct vnode *ddvp;
 2326         struct mtx *vlp;
 2327 
 2328         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
 2329         vlp = VP2VNODELOCK(vp);
 2330         mtx_lock(vlp);
 2331         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
 2332                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
 2333                         continue;
 2334                 ddvp = ncp->nc_dvp;
 2335                 vhold(ddvp);
 2336                 mtx_unlock(vlp);
 2337                 if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread))
 2338                         return (NULL);
 2339                 return (ddvp);
 2340         }
 2341         mtx_unlock(vlp);
 2342         return (NULL);
 2343 }
 2344 
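/*
 * Copy the cached component name of vp into buf, NUL-terminated and
 * truncated to buflen; returns ENOENT if the name cache has no entry
 * for the vnode.
 */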
 2345 int
 2346 vn_commname(struct vnode *vp, char *buf, u_int buflen)
 2347 {
 2348         struct namecache *ncp;
 2349         struct mtx *vlp;
 2350         int l;
 2351 
 2352         vlp = VP2VNODELOCK(vp);
 2353         mtx_lock(vlp);
 2354         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
 2355                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 2356                         break;
 2357         if (ncp == NULL) {
 2358                 mtx_unlock(vlp);
 2359                 return (ENOENT);
 2360         }
 2361         l = min(ncp->nc_nlen, buflen - 1);
 2362         memcpy(buf, nc_get_name(ncp), l);
 2363         mtx_unlock(vlp);
 2364         buf[l] = '\0';
 2365         return (0);
 2366 }
 2367 
 2368 /* ABI compat shims for old kernel modules. */
 2369 #undef cache_enter
 2370 
 2371 void    cache_enter(struct vnode *dvp, struct vnode *vp,
 2372             struct componentname *cnp);
 2373 
 2374 void
 2375 cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 2376 {
 2377 
 2378         cache_enter_time(dvp, vp, cnp, NULL, NULL);
 2379 }
 2380 
 2381 /*
 2382  * This function updates the path string to the vnode's full global path
 2383  * and checks the size of the new path string against the pathlen argument.
 2384  *
 2385  * Requires a locked, referenced vnode.
 2386  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 2387  *
 2388  * If sysctl debug.disablefullpath is set, ENODEV is returned,
 2389  * the vnode is left locked and the path remains untouched.
 2390  *
 2391  * If vp is a directory, the call to vn_fullpath_global() always succeeds
 2392  * because it falls back to the ".." lookup if the namecache lookup fails.
 2393  */
 2394 int
 2395 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
 2396     u_int pathlen)
 2397 {
 2398         struct nameidata nd;
 2399         struct vnode *vp1;
 2400         char *rpath, *fbuf;
 2401         int error;
 2402 
 2403         ASSERT_VOP_ELOCKED(vp, __func__);
 2404 
 2405         /* Return ENODEV if sysctl debug.disablefullpath==1 */
 2406         if (__predict_false(disablefullpath))
 2407                 return (ENODEV);
 2408 
 2409         /* Construct global filesystem path from vp. */
 2410         VOP_UNLOCK(vp, 0);
 2411         error = vn_fullpath_global(td, vp, &rpath, &fbuf);
 2412 
 2413         if (error != 0) {
 2414                 vrele(vp);
 2415                 return (error);
 2416         }
 2417 
 2418         if (strlen(rpath) >= pathlen) {
 2419                 vrele(vp);
 2420                 error = ENAMETOOLONG;
 2421                 goto out;
 2422         }
 2423 
 2424         /*
 2425          * Re-lookup the vnode by path to detect a possible rename.
 2426          * As a side effect, the vnode is relocked.
 2427          * If vnode was renamed, return ENOENT.
 2428          */
 2429         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 2430             UIO_SYSSPACE, path, td);
 2431         error = namei(&nd);
 2432         if (error != 0) {
 2433                 vrele(vp);
 2434                 goto out;
 2435         }
 2436         NDFREE(&nd, NDF_ONLY_PNBUF);
 2437         vp1 = nd.ni_vp;
 2438         vrele(vp);
 2439         if (vp1 == vp)
 2440                 strcpy(path, rpath);
 2441         else {
 2442                 vput(vp1);
 2443                 error = ENOENT;
 2444         }
 2445 
 2446 out:
 2447         free(fbuf, M_TEMP);
 2448         return (error);
 2449 }
