FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_cache.c


    1 /*      $NetBSD: vfs_cache.c,v 1.152 2021/11/01 21:28:03 andvar Exp $   */
    2 
    3 /*-
    4  * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to The NetBSD Foundation
    8  * by Andrew Doran.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   29  * POSSIBILITY OF SUCH DAMAGE.
   30  */
   31 
   32 /*
   33  * Copyright (c) 1989, 1993
   34  *      The Regents of the University of California.  All rights reserved.
   35  *
   36  * Redistribution and use in source and binary forms, with or without
   37  * modification, are permitted provided that the following conditions
   38  * are met:
   39  * 1. Redistributions of source code must retain the above copyright
   40  *    notice, this list of conditions and the following disclaimer.
   41  * 2. Redistributions in binary form must reproduce the above copyright
   42  *    notice, this list of conditions and the following disclaimer in the
   43  *    documentation and/or other materials provided with the distribution.
   44  * 3. Neither the name of the University nor the names of its contributors
   45  *    may be used to endorse or promote products derived from this software
   46  *    without specific prior written permission.
   47  *
   48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   58  * SUCH DAMAGE.
   59  *
   60  *      @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94
   61  */
   62 
   63 /*
   64  * Name caching:
   65  *
   66  *      Names found by directory scans are retained in a cache for future
   67  *      reference.  It is managed LRU, so frequently used names will hang
   68  *      around.  The cache is indexed by hash value obtained from the name.
   69  *
   70  *      The name cache is the brainchild of Robert Elz and was introduced in
   71  *      4.3BSD.  See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk
   72  *      McKusick, May 21 1984.
   73  *
   74  * Data structures:
   75  *
   76  *      Most Unix namecaches very sensibly use a global hash table to index
   77  *      names.  The global hash table works well, but can cause concurrency
   78  *      headaches for the kernel hacker.  In the NetBSD 10.0 implementation
   79  *      we are not sensible, and use a per-directory data structure to index
   80  *      names, but the cache otherwise functions the same.
   81  *
   82  *      The index is a red-black tree.  There are no special concurrency
   83  *      requirements placed on it, because it's per-directory and protected
   84  *      by the namecache's per-directory locks.  It should therefore not be
   85  *      difficult to experiment with other types of index.
   86  *
   87  *      Each cached name is stored in a struct namecache, along with a
   88  *      pointer to the associated vnode (nc_vp).  Names longer than a
   89  *      maximum length of NCHNAMLEN are allocated with kmem_alloc(); they
   90  *      occur infrequently, and names shorter than this are stored directly
   91  *      in struct namecache.  If it is a "negative" entry, (i.e. for a name
   92  *      that is known NOT to exist) the vnode pointer will be NULL.
   93  *
   94  *      For a directory with 3 cached names for 3 distinct vnodes, the
   95  *      various vnodes and namecache structs would be connected like this
   96  *      (the root is at the bottom of the diagram):
   97  *
   98  *          ...
   99  *           ^
  100  *           |- vi_nc_tree
  101  *           |
  102  *      +----o----+               +---------+               +---------+
  103  *      |  VDIR   |               |  VCHR   |               |  VREG   |
  104  *      |  vnode  o-----+         |  vnode  o-----+         |  vnode  o------+
  105  *      +---------+     |         +---------+     |         +---------+      |
  106  *           ^          |              ^          |              ^           |
  107  *           |- nc_vp   |- vi_nc_list  |- nc_vp   |- vi_nc_list  |- nc_vp    |
  108  *           |          |              |          |              |           |
  109  *      +----o----+     |         +----o----+     |         +----o----+      |
  110  *  +---onamecache|<----+     +---onamecache|<----+     +---onamecache|<-----+
  111  *  |   +---------+           |   +---------+           |   +---------+
  112  *  |        ^                |        ^                |        ^
  113  *  |        |                |        |                |        |
  114  *  |        |  +----------------------+                |        |
  115  *  |-nc_dvp | +-------------------------------------------------+
  116  *  |        |/- vi_nc_tree   |                         |
  117  *  |        |                |- nc_dvp                 |- nc_dvp
  118  *  |   +----o----+           |                         |
  119  *  +-->|  VDIR   |<----------+                         |
  120  *      |  vnode  |<------------------------------------+
  121  *      +---------+
  122  *
   123  *
  124  *
  125  * Replacement:
  126  *
  127  *      As the cache becomes full, old and unused entries are purged as new
  128  *      entries are added.  The synchronization overhead in maintaining a
  129  *      strict ordering would be prohibitive, so the VM system's "clock" or
  130  *      "second chance" page replacement algorithm is aped here.  New
  131  *      entries go to the tail of the active list.  After they age out and
  132  *      reach the head of the list, they are moved to the tail of the
  133  *      inactive list.  Any use of the deactivated cache entry reactivates
  134  *      it, saving it from impending doom; if not reactivated, the entry
  135  *      eventually reaches the head of the inactive list and is purged.
  136  *
  137  * Concurrency:
  138  *
  139  *      From a performance perspective, cache_lookup(nameiop == LOOKUP) is
  140  *      what really matters; insertion of new entries with cache_enter() is
  141  *      comparatively infrequent, and overshadowed by the cost of expensive
  142  *      file system metadata operations (which may involve disk I/O).  We
  143  *      therefore want to make everything simplest in the lookup path.
  144  *
  145  *      struct namecache is mostly stable except for list and tree related
  146  *      entries, changes to which don't affect the cached name or vnode. 
  147  *      For changes to name+vnode, entries are purged in preference to
  148  *      modifying them.
  149  *
  150  *      Read access to namecache entries is made via tree, list, or LRU
  151  *      list.  A lock corresponding to the direction of access should be
  152  *      held.  See definition of "struct namecache" in src/sys/namei.src,
  153  *      and the definition of "struct vnode" for the particulars.
  154  *
  155  *      Per-CPU statistics, and LRU list totals are read unlocked, since
  156  *      an approximate value is OK.  We maintain 32-bit sized per-CPU
  157  *      counters and 64-bit global counters under the theory that 32-bit
  158  *      sized counters are less likely to be hosed by nonatomic increment
  159  *      (on 32-bit platforms).
  160  *
  161  *      The lock order is:
  162  *
  163  *      1) vi->vi_nc_lock       (tree or parent -> child direction,
  164  *                               used during forward lookup)
  165  *
  166  *      2) vi->vi_nc_listlock   (list or child -> parent direction,
  167  *                               used during reverse lookup)
  168  *
  169  *      3) cache_lru_lock       (LRU list direction, used during reclaim)
  170  *
  171  *      4) vp->v_interlock      (what the cache entry points to)
  172  */
  173 
  174 #include <sys/cdefs.h>
  175 __KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.152 2021/11/01 21:28:03 andvar Exp $");
  176 
  177 #define __NAMECACHE_PRIVATE
  178 #ifdef _KERNEL_OPT
  179 #include "opt_ddb.h"
  180 #include "opt_dtrace.h"
  181 #endif
  182 
  183 #include <sys/param.h>
  184 #include <sys/types.h>
  185 #include <sys/atomic.h>
  186 #include <sys/callout.h>
  187 #include <sys/cpu.h>
  188 #include <sys/errno.h>
  189 #include <sys/evcnt.h>
  190 #include <sys/hash.h>
  191 #include <sys/kernel.h>
  192 #include <sys/mount.h>
  193 #include <sys/mutex.h>
  194 #include <sys/namei.h>
  195 #include <sys/param.h>
  196 #include <sys/pool.h>
  197 #include <sys/sdt.h>
  198 #include <sys/sysctl.h>
  199 #include <sys/systm.h>
  200 #include <sys/time.h>
  201 #include <sys/vnode_impl.h>
  202 
  203 #include <miscfs/genfs/genfs.h>
  204 
  205 static void     cache_activate(struct namecache *);
  206 static void     cache_update_stats(void *);
  207 static int      cache_compare_nodes(void *, const void *, const void *);
  208 static void     cache_deactivate(void);
  209 static void     cache_reclaim(void);
  210 static int      cache_stat_sysctl(SYSCTLFN_ARGS);
  211 
  212 /*
  213  * Global pool cache.
  214  */
  215 static pool_cache_t cache_pool __read_mostly;
  216 
  217 /*
  218  * LRU replacement.
  219  */
  220 enum cache_lru_id {
  221         LRU_ACTIVE,
  222         LRU_INACTIVE,
  223         LRU_COUNT
  224 };
  225 
  226 static struct {
  227         TAILQ_HEAD(, namecache) list[LRU_COUNT];
  228         u_int                   count[LRU_COUNT];
  229 } cache_lru __cacheline_aligned;
  230 
  231 static kmutex_t cache_lru_lock __cacheline_aligned;
  232 
  233 /*
  234  * Cache effectiveness statistics.  nchstats holds system-wide total.
  235  */
  236 struct nchstats nchstats;
  237 struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);
  238 struct nchcpu {
  239         struct nchstats_percpu cur;
  240         struct nchstats_percpu last;
  241 };
  242 static callout_t cache_stat_callout;
  243 static kmutex_t cache_stat_lock __cacheline_aligned;
  244 
  245 #define COUNT(f) do { \
  246         lwp_t *l = curlwp; \
  247         KPREEMPT_DISABLE(l); \
  248         struct nchcpu *nchcpu = curcpu()->ci_data.cpu_nch; \
  249         nchcpu->cur.f++; \
  250         KPREEMPT_ENABLE(l); \
  251 } while (/* CONSTCOND */ 0);
  252 
  253 #define UPDATE(nchcpu, f) do { \
  254         uint32_t cur = atomic_load_relaxed(&nchcpu->cur.f); \
  255         nchstats.f += (uint32_t)(cur - nchcpu->last.f); \
  256         nchcpu->last.f = cur; \
  257 } while (/* CONSTCOND */ 0)
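       /*
        * COUNT() and UPDATE() lean on unsigned 32-bit wraparound: even if a
        * per-CPU counter overflows between folds, (cur - last) is still the
        * number of events since the previous fold.  A minimal userland
        * sketch of the same idea (names here are hypothetical, not part of
        * this file):
        *
        *      static uint32_t cur, last;      // per-CPU, incremented often
        *      static uint64_t total;          // global, folded occasionally
        *
        *      static void
        *      fold(void)
        *      {
        *              uint32_t c = cur;                  // relaxed read
        *              total += (uint32_t)(c - last);     // wrap-safe delta
        *              last = c;
        *      }
        */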
  258 
  259 /*
  260  * Tunables.  cache_maxlen replaces the historical doingcache:
  261  * set it zero to disable caching for debugging purposes.
  262  */
  263 int cache_lru_maxdeact __read_mostly = 2;       /* max # to deactivate */
  264 int cache_lru_maxscan __read_mostly = 64;       /* max # to scan/reclaim */
  265 int cache_maxlen __read_mostly = USHRT_MAX;     /* max name length to cache */
  266 int cache_stat_interval __read_mostly = 300;    /* in seconds */
  267 
  268 /*
  269  * sysctl stuff.
  270  */
  271 static struct   sysctllog *cache_sysctllog;
  272 
  273 /*
  274  * This is a dummy name that cannot usually occur anywhere in the cache nor
  275  * file system.  It's used when caching the root vnode of mounted file
  276  * systems.  The name is attached to the directory that the file system is
  277  * mounted on.
  278  */
  279 static const char cache_mp_name[] = "";
  280 static const int cache_mp_nlen = sizeof(cache_mp_name) - 1;
  281 
  282 /*
  283  * Red-black tree stuff.
  284  */
  285 static const rb_tree_ops_t cache_rbtree_ops = {
  286         .rbto_compare_nodes = cache_compare_nodes,
  287         .rbto_compare_key = cache_compare_nodes,
  288         .rbto_node_offset = offsetof(struct namecache, nc_tree),
  289         .rbto_context = NULL
  290 };
  291 
  292 /*
  293  * dtrace probes.
  294  */
  295 SDT_PROVIDER_DEFINE(vfs);
  296 
  297 SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *");
  298 SDT_PROBE_DEFINE1(vfs, namecache, purge, parents, "struct vnode *");
  299 SDT_PROBE_DEFINE1(vfs, namecache, purge, children, "struct vnode *");
  300 SDT_PROBE_DEFINE2(vfs, namecache, purge, name, "char *", "size_t");
  301 SDT_PROBE_DEFINE1(vfs, namecache, purge, vfs, "struct mount *");
  302 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *",
  303     "char *", "size_t");
  304 SDT_PROBE_DEFINE3(vfs, namecache, lookup, miss, "struct vnode *",
  305     "char *", "size_t");
  306 SDT_PROBE_DEFINE3(vfs, namecache, lookup, toolong, "struct vnode *",
  307     "char *", "size_t");
  308 SDT_PROBE_DEFINE2(vfs, namecache, revlookup, success, "struct vnode *",
  309      "struct vnode *");
  310 SDT_PROBE_DEFINE2(vfs, namecache, revlookup, fail, "struct vnode *",
  311      "int");
  312 SDT_PROBE_DEFINE2(vfs, namecache, prune, done, "int", "int");
  313 SDT_PROBE_DEFINE3(vfs, namecache, enter, toolong, "struct vnode *",
  314     "char *", "size_t");
  315 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *",
  316     "char *", "size_t");
  317 
  318 /*
  319  * rbtree: compare two nodes.
  320  */
  321 static int
  322 cache_compare_nodes(void *context, const void *n1, const void *n2)
  323 {
  324         const struct namecache *nc1 = n1;
  325         const struct namecache *nc2 = n2;
  326 
  327         if (nc1->nc_key < nc2->nc_key) {
  328                 return -1;
  329         }
  330         if (nc1->nc_key > nc2->nc_key) {
  331                 return 1;
  332         }
  333         KASSERT(nc1->nc_nlen == nc2->nc_nlen);
  334         return memcmp(nc1->nc_name, nc2->nc_name, nc1->nc_nlen);
  335 }
  336 
  337 /*
  338  * Compute a key value for the given name.  The name length is encoded in
  339  * the key value to try and improve uniqueness, and so that length doesn't
  340  * need to be compared separately for string comparisons.
  341  */
  342 static inline uint64_t
  343 cache_key(const char *name, size_t nlen)
  344 {
  345         uint64_t key;
  346 
  347         KASSERT(nlen <= USHRT_MAX);
  348 
  349         key = hash32_buf(name, nlen, HASH32_STR_INIT);
  350         return (key << 32) | nlen;
  351 }
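       /*
        * To make the key construction above concrete: the 32-bit name hash
        * lands in the upper half of the key and the length in the lower
        * half, so names of different lengths can never produce equal keys,
        * and a key match guarantees the lengths agree.  Sketch only, with
        * hash32() standing in for hash32_buf():
        *
        *      uint64_t key = ((uint64_t)hash32(name, nlen) << 32) | nlen;
        *      // key1 == key2 implies nlen1 == nlen2, so a single memcmp()
        *      // of nlen bytes is enough to confirm a match.
        */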
  352 
  353 /*
  354  * Remove an entry from the cache.  vi_nc_lock must be held, and if dir2node
  355  * is true, then we're locking in the conventional direction and the list
  356  * lock will be acquired when removing the entry from the vnode list.
  357  */
  358 static void
  359 cache_remove(struct namecache *ncp, const bool dir2node)
  360 {
  361         struct vnode *vp, *dvp = ncp->nc_dvp;
  362         vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
  363 
  364         KASSERT(rw_write_held(&dvi->vi_nc_lock));
  365         KASSERT(cache_key(ncp->nc_name, ncp->nc_nlen) == ncp->nc_key);
  366         KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, ncp) == ncp);
  367 
  368         SDT_PROBE(vfs, namecache, invalidate, done, ncp,
  369             0, 0, 0, 0);
  370 
  371         /*
  372          * Remove from the vnode's list.  This excludes cache_revlookup(),
  373          * and then it's safe to remove from the LRU lists.
  374          */
  375         if ((vp = ncp->nc_vp) != NULL) {
  376                 vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
  377                 if (__predict_true(dir2node)) {
  378                         rw_enter(&vi->vi_nc_listlock, RW_WRITER);
  379                         TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list);
  380                         rw_exit(&vi->vi_nc_listlock);
  381                 } else {
  382                         TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list);
  383                 }
  384         }
  385 
  386         /* Remove from the directory's rbtree. */
  387         rb_tree_remove_node(&dvi->vi_nc_tree, ncp);
  388 
  389         /* Remove from the LRU lists. */
  390         mutex_enter(&cache_lru_lock);
  391         TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
  392         cache_lru.count[ncp->nc_lrulist]--;
  393         mutex_exit(&cache_lru_lock);
  394 
  395         /* Finally, free it. */
  396         if (ncp->nc_nlen > NCHNAMLEN) {
  397                 size_t sz = offsetof(struct namecache, nc_name[ncp->nc_nlen]);
  398                 kmem_free(ncp, sz);
  399         } else {
  400                 pool_cache_put(cache_pool, ncp);
  401         }
  402 }
  403 
  404 /*
  405  * Find a single cache entry and return it.  vi_nc_lock must be held.
  406  */
  407 static struct namecache * __noinline
  408 cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen,
  409     uint64_t key)
  410 {
  411         vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
  412         struct rb_node *node = dvi->vi_nc_tree.rbt_root;
  413         struct namecache *ncp;
  414         int lrulist, diff;
  415 
  416         KASSERT(rw_lock_held(&dvi->vi_nc_lock));
  417 
  418         /*
  419          * Search the RB tree for the key.  This is an inlined lookup
  420          * tailored for exactly what's needed here (64-bit key and so on)
  421          * that is quite a bit faster than using rb_tree_find_node().
  422          *
  423          * For a matching key memcmp() needs to be called once to confirm
  424          * that the correct name has been found.  Very rarely there will be
  425          * a key value collision and the search will continue.
  426          */
  427         for (;;) {
  428                 if (__predict_false(RB_SENTINEL_P(node))) {
  429                         return NULL;
  430                 }
  431                 ncp = (struct namecache *)node;
  432                 KASSERT((void *)&ncp->nc_tree == (void *)ncp);
  433                 KASSERT(ncp->nc_dvp == dvp);
  434                 if (ncp->nc_key == key) {
  435                         KASSERT(ncp->nc_nlen == namelen);
  436                         diff = memcmp(ncp->nc_name, name, namelen);
  437                         if (__predict_true(diff == 0)) {
  438                                 break;
  439                         }
  440                         node = node->rb_nodes[diff < 0];
  441                 } else {
  442                         node = node->rb_nodes[ncp->nc_key < key];
  443                 }
  444         }
  445 
  446         /*
  447          * If the entry is on the wrong LRU list, requeue it.  This is an
  448          * unlocked check, but it will rarely be wrong and even then there
  449          * will be no harm caused.
  450          */
  451         lrulist = atomic_load_relaxed(&ncp->nc_lrulist);
  452         if (__predict_false(lrulist != LRU_ACTIVE)) {
  453                 cache_activate(ncp);
  454         }
  455         return ncp;
  456 }
  457 
  458 /*
   459  * Look for the name in the cache.  We don't do this
  460  * if the segment name is long, simply so the cache can avoid
  461  * holding long names (which would either waste space, or
  462  * add greatly to the complexity).
  463  *
  464  * Lookup is called with DVP pointing to the directory to search,
  465  * and CNP providing the name of the entry being sought: cn_nameptr
  466  * is the name, cn_namelen is its length, and cn_flags is the flags
  467  * word from the namei operation.
  468  *
  469  * DVP must be locked.
  470  *
  471  * There are three possible non-error return states:
  472  *    1. Nothing was found in the cache. Nothing is known about
  473  *       the requested name.
  474  *    2. A negative entry was found in the cache, meaning that the
  475  *       requested name definitely does not exist.
  476  *    3. A positive entry was found in the cache, meaning that the
  477  *       requested name does exist and that we are providing the
  478  *       vnode.
  479  * In these cases the results are:
  480  *    1. 0 returned; VN is set to NULL.
  481  *    2. 1 returned; VN is set to NULL.
  482  *    3. 1 returned; VN is set to the vnode found.
  483  *
  484  * The additional result argument ISWHT is set to zero, unless a
  485  * negative entry is found that was entered as a whiteout, in which
  486  * case ISWHT is set to one.
  487  *
  488  * The ISWHT_RET argument pointer may be null. In this case an
  489  * assertion is made that the whiteout flag is not set. File systems
  490  * that do not support whiteouts can/should do this.
  491  *
  492  * Filesystems that do support whiteouts should add ISWHITEOUT to
  493  * cnp->cn_flags if ISWHT comes back nonzero.
  494  *
  495  * When a vnode is returned, it is locked, as per the vnode lookup
  496  * locking protocol.
  497  *
  498  * There is no way for this function to fail, in the sense of
  499  * generating an error that requires aborting the namei operation.
  500  *
  501  * (Prior to October 2012, this function returned an integer status,
  502  * and a vnode, and mucked with the flags word in CNP for whiteouts.
  503  * The integer status was -1 for "nothing found", ENOENT for "a
  504  * negative entry found", 0 for "a positive entry found", and possibly
  505  * other errors, and the value of VN might or might not have been set
  506  * depending on what error occurred.)
  507  */
  508 bool
  509 cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
  510              uint32_t nameiop, uint32_t cnflags,
  511              int *iswht_ret, struct vnode **vn_ret)
  512 {
  513         vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
  514         struct namecache *ncp;
  515         struct vnode *vp;
  516         uint64_t key;
  517         int error;
  518         bool hit;
  519         krw_t op;
  520 
  521         KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);
  522 
  523         /* Establish default result values */
  524         if (iswht_ret != NULL) {
  525                 *iswht_ret = 0;
  526         }
  527         *vn_ret = NULL;
  528 
  529         if (__predict_false(namelen > cache_maxlen)) {
  530                 SDT_PROBE(vfs, namecache, lookup, toolong, dvp,
  531                     name, namelen, 0, 0);
  532                 COUNT(ncs_long);
  533                 return false;
  534         }
  535 
  536         /* Compute the key up front - don't need the lock. */
  537         key = cache_key(name, namelen);
  538 
  539         /* Could the entry be purged below? */
  540         if ((cnflags & ISLASTCN) != 0 &&
  541             ((cnflags & MAKEENTRY) == 0 || nameiop == CREATE)) {
  542                 op = RW_WRITER;
  543         } else {
  544                 op = RW_READER;
  545         }
  546 
  547         /* Now look for the name. */
  548         rw_enter(&dvi->vi_nc_lock, op);
  549         ncp = cache_lookup_entry(dvp, name, namelen, key);
  550         if (__predict_false(ncp == NULL)) {
  551                 rw_exit(&dvi->vi_nc_lock);
  552                 COUNT(ncs_miss);
  553                 SDT_PROBE(vfs, namecache, lookup, miss, dvp,
  554                     name, namelen, 0, 0);
  555                 return false;
  556         }
  557         if (__predict_false((cnflags & MAKEENTRY) == 0)) {
  558                 /*
  559                  * Last component and we are renaming or deleting,
  560                  * the cache entry is invalid, or otherwise don't
  561                  * want cache entry to exist.
  562                  */
  563                 KASSERT((cnflags & ISLASTCN) != 0);
  564                 cache_remove(ncp, true);
  565                 rw_exit(&dvi->vi_nc_lock);
  566                 COUNT(ncs_badhits);
  567                 return false;
  568         }
  569         if (ncp->nc_vp == NULL) {
  570                 if (iswht_ret != NULL) {
  571                         /*
  572                          * Restore the ISWHITEOUT flag saved earlier.
  573                          */
  574                         *iswht_ret = ncp->nc_whiteout;
  575                 } else {
  576                         KASSERT(!ncp->nc_whiteout);
  577                 }
  578                 if (nameiop == CREATE && (cnflags & ISLASTCN) != 0) {
  579                         /*
  580                          * Last component and we are preparing to create
  581                          * the named object, so flush the negative cache
  582                          * entry.
  583                          */
  584                         COUNT(ncs_badhits);
  585                         cache_remove(ncp, true);
  586                         hit = false;
  587                 } else {
  588                         COUNT(ncs_neghits);
  589                         SDT_PROBE(vfs, namecache, lookup, hit, dvp, name,
  590                             namelen, 0, 0);
  591                         /* found neg entry; vn is already null from above */
  592                         hit = true;
  593                 }
  594                 rw_exit(&dvi->vi_nc_lock);
  595                 return hit;
  596         }
  597         vp = ncp->nc_vp;
  598         error = vcache_tryvget(vp);
  599         rw_exit(&dvi->vi_nc_lock);
  600         if (error) {
  601                 KASSERT(error == EBUSY);
  602                 /*
  603                  * This vnode is being cleaned out.
  604                  * XXX badhits?
  605                  */
  606                 COUNT(ncs_falsehits);
  607                 return false;
  608         }
  609 
  610         COUNT(ncs_goodhits);
  611         SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
  612         /* found it */
  613         *vn_ret = vp;
  614         return true;
  615 }
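       /*
        * A hedged sketch of how a file system's lookup routine typically
        * acts on the three result states documented above (locking,
        * whiteout handling and error paths elided):
        *
        *      struct vnode *vp;
        *
        *      if (cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
        *          cnp->cn_nameiop, cnp->cn_flags, NULL, &vp)) {
        *              if (vp == NULL)
        *                      return ENOENT;  // negative entry: name absent
        *              *vpp = vp;              // positive entry found
        *              return 0;
        *      }
        *      // miss: nothing known, scan the directory and cache_enter()
        *      // whatever is found (or the negative result).
        */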
  616 
  617 /*
  618  * Version of the above without the nameiop argument, for NFS.
  619  */
  620 bool
  621 cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen,
  622                  uint32_t cnflags,
  623                  int *iswht_ret, struct vnode **vn_ret)
  624 {
  625 
  626         return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY,
  627             iswht_ret, vn_ret);
  628 }
  629 
  630 /*
  631  * Used by namei() to walk down a path, component by component by looking up
  632  * names in the cache.  The node locks are chained along the way: a parent's
  633  * lock is not dropped until the child's is acquired.
  634  */
  635 bool
  636 cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen,
  637                     struct vnode **vn_ret, krwlock_t **plock,
  638                     kauth_cred_t cred)
  639 {
  640         vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
  641         struct namecache *ncp;
  642         krwlock_t *oldlock, *newlock;
  643         uint64_t key;
  644         int error;
  645 
  646         KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);
  647 
  648         /* If disabled, or file system doesn't support this, bail out. */
  649         if (__predict_false((dvp->v_mount->mnt_iflag & IMNT_NCLOOKUP) == 0)) {
  650                 return false;
  651         }
  652 
  653         if (__predict_false(namelen > cache_maxlen)) {
  654                 COUNT(ncs_long);
  655                 return false;
  656         }
  657 
  658         /* Compute the key up front - don't need the lock. */
  659         key = cache_key(name, namelen);
  660 
  661         /*
  662          * Acquire the directory lock.  Once we have that, we can drop the
  663          * previous one (if any).
  664          *
  665          * The two lock holds mean that the directory can't go away while
  666          * here: the directory must be purged with cache_purge() before
  667          * being freed, and both parent & child's vi_nc_lock must be taken
  668          * before that point is passed.
  669          *
  670          * However if there's no previous lock, like at the root of the
  671          * chain, then "dvp" must be referenced to prevent dvp going away
  672          * before we get its lock.
  673          *
  674          * Note that the two locks can be the same if looking up a dot, for
  675          * example: /usr/bin/.  If looking up the parent (..) we can't wait
  676          * on the lock as child -> parent is the wrong direction.
  677          */
  678         if (*plock != &dvi->vi_nc_lock) {
  679                 oldlock = *plock;
  680                 newlock = &dvi->vi_nc_lock;
  681                 if (!rw_tryenter(&dvi->vi_nc_lock, RW_READER)) {
  682                         return false;
  683                 }
  684         } else {
  685                 oldlock = NULL;
  686                 newlock = NULL;
  687                 if (*plock == NULL) {
  688                         KASSERT(vrefcnt(dvp) > 0);
  689                 }
  690         }
  691 
  692         /*
  693          * First up check if the user is allowed to look up files in this
  694          * directory.
  695          */
  696         if (cred != FSCRED) {
  697                 if (dvi->vi_nc_mode == VNOVAL) {
  698                         if (newlock != NULL) {
  699                                 rw_exit(newlock);
  700                         }
  701                         return false;
  702                 }
  703                 KASSERT(dvi->vi_nc_uid != VNOVAL && dvi->vi_nc_gid != VNOVAL);
  704                 error = kauth_authorize_vnode(cred,
  705                     KAUTH_ACCESS_ACTION(VEXEC,
  706                     dvp->v_type, dvi->vi_nc_mode & ALLPERMS), dvp, NULL,
  707                     genfs_can_access(dvp, cred, dvi->vi_nc_uid, dvi->vi_nc_gid,
  708                     dvi->vi_nc_mode & ALLPERMS, NULL, VEXEC));
  709                 if (error != 0) {
  710                         if (newlock != NULL) {
  711                                 rw_exit(newlock);
  712                         }
  713                         COUNT(ncs_denied);
  714                         return false;
  715                 }
  716         }
  717 
  718         /*
  719          * Now look for a matching cache entry.
  720          */
  721         ncp = cache_lookup_entry(dvp, name, namelen, key);
  722         if (__predict_false(ncp == NULL)) {
  723                 if (newlock != NULL) {
  724                         rw_exit(newlock);
  725                 }
  726                 COUNT(ncs_miss);
  727                 SDT_PROBE(vfs, namecache, lookup, miss, dvp,
  728                     name, namelen, 0, 0);
  729                 return false;
  730         }
  731         if (ncp->nc_vp == NULL) {
  732                 /* found negative entry; vn is already null from above */
  733                 KASSERT(namelen != cache_mp_nlen && name != cache_mp_name);
  734                 COUNT(ncs_neghits);
  735         } else {
  736                 COUNT(ncs_goodhits); /* XXX can be "badhits" */
  737         }
  738         SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
  739 
  740         /*
  741          * Return with the directory lock still held.  It will either be
  742          * returned to us with another call to cache_lookup_linked() when
  743          * looking up the next component, or the caller will release it
  744          * manually when finished.
  745          */
  746         if (oldlock) {
  747                 rw_exit(oldlock);
  748         }
  749         if (newlock) {
  750                 *plock = newlock;
  751         }
  752         *vn_ret = ncp->nc_vp;
  753         return true;
  754 }
  755 
  756 /*
  757  * Scan cache looking for name of directory entry pointing at vp.
  758  * Will not search for "." or "..".
  759  *
  760  * If the lookup succeeds the vnode is referenced and stored in dvpp.
  761  *
  762  * If bufp is non-NULL, also place the name in the buffer which starts
  763  * at bufp, immediately before *bpp, and move bpp backwards to point
  764  * at the start of it.  (Yes, this is a little baroque, but it's done
  765  * this way to cater to the whims of getcwd).
  766  *
  767  * Returns 0 on success, -1 on cache miss, positive errno on failure.
  768  */
  769 int
  770 cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp,
  771     bool checkaccess, accmode_t accmode)
  772 {
  773         vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
  774         struct namecache *ncp;
  775         struct vnode *dvp;
  776         int error, nlen, lrulist;
  777         char *bp;
  778 
  779         KASSERT(vp != NULL);
  780 
  781         if (cache_maxlen == 0)
  782                 goto out;
  783 
  784         rw_enter(&vi->vi_nc_listlock, RW_READER);
  785         if (checkaccess) {
  786                 /*
  787                  * Check if the user is allowed to see.  NOTE: this is
  788                  * checking for access on the "wrong" directory.  getcwd()
  789                  * wants to see that there is access on every component
  790                  * along the way, not that there is access to any individual
  791                  * component.  Don't use this to check you can look in vp.
  792                  *
  793                  * I don't like it, I didn't come up with it, don't blame me!
  794                  */
  795                 if (vi->vi_nc_mode == VNOVAL) {
  796                         rw_exit(&vi->vi_nc_listlock);
  797                         return -1;
  798                 }
  799                 KASSERT(vi->vi_nc_uid != VNOVAL && vi->vi_nc_gid != VNOVAL);
  800                 error = kauth_authorize_vnode(kauth_cred_get(),
  801                     KAUTH_ACCESS_ACTION(VEXEC, vp->v_type, vi->vi_nc_mode &
  802                     ALLPERMS), vp, NULL, genfs_can_access(vp, curlwp->l_cred,
  803                     vi->vi_nc_uid, vi->vi_nc_gid, vi->vi_nc_mode & ALLPERMS,
  804                     NULL, accmode));
   805                 if (error != 0) {
  806                         rw_exit(&vi->vi_nc_listlock);
  807                         COUNT(ncs_denied);
  808                         return EACCES;
  809                 }
  810         }
  811         TAILQ_FOREACH(ncp, &vi->vi_nc_list, nc_list) {
  812                 KASSERT(ncp->nc_vp == vp);
  813                 KASSERT(ncp->nc_dvp != NULL);
  814                 nlen = ncp->nc_nlen;
  815 
  816                 /*
  817                  * Ignore mountpoint entries.
  818                  */
  819                 if (ncp->nc_nlen == cache_mp_nlen) {
  820                         continue;
  821                 }
  822 
  823                 /*
  824                  * The queue is partially sorted.  Once we hit dots, nothing
  825                  * else remains but dots and dotdots, so bail out.
  826                  */
  827                 if (ncp->nc_name[0] == '.') {
  828                         if (nlen == 1 ||
  829                             (nlen == 2 && ncp->nc_name[1] == '.')) {
  830                                 break;
  831                         }
  832                 }
  833 
  834                 /*
  835                  * Record a hit on the entry.  This is an unlocked read but
  836                  * even if wrong it doesn't matter too much.
  837                  */
  838                 lrulist = atomic_load_relaxed(&ncp->nc_lrulist);
  839                 if (lrulist != LRU_ACTIVE) {
  840                         cache_activate(ncp);
  841                 }
  842 
  843                 if (bufp) {
  844                         bp = *bpp;
  845                         bp -= nlen;
  846                         if (bp <= bufp) {
  847                                 *dvpp = NULL;
  848                                 rw_exit(&vi->vi_nc_listlock);
  849                                 SDT_PROBE(vfs, namecache, revlookup,
  850                                     fail, vp, ERANGE, 0, 0, 0);
  851                                 return (ERANGE);
  852                         }
  853                         memcpy(bp, ncp->nc_name, nlen);
  854                         *bpp = bp;
  855                 }
  856 
  857                 dvp = ncp->nc_dvp;
  858                 error = vcache_tryvget(dvp);
  859                 rw_exit(&vi->vi_nc_listlock);
  860                 if (error) {
  861                         KASSERT(error == EBUSY);
  862                         if (bufp)
  863                                 (*bpp) += nlen;
  864                         *dvpp = NULL;
  865                         SDT_PROBE(vfs, namecache, revlookup, fail, vp,
  866                             error, 0, 0, 0);
  867                         return -1;
  868                 }
  869                 *dvpp = dvp;
  870                 SDT_PROBE(vfs, namecache, revlookup, success, vp, dvp,
  871                     0, 0, 0);
  872                 COUNT(ncs_revhits);
  873                 return (0);
  874         }
  875         rw_exit(&vi->vi_nc_listlock);
  876         COUNT(ncs_revmiss);
  877  out:
  878         *dvpp = NULL;
  879         return (-1);
  880 }
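       /*
        * Sketch of the getcwd()-style convention described above: the
        * caller hands in a buffer and a cursor starting at its end, and
        * each successful call prepends one name and steps to the
        * (referenced) parent.  Separator handling and cleanup are elided,
        * and the buffer size is illustrative:
        *
        *      char buf[MAXPATHLEN], *bp = buf + MAXPATHLEN;
        *      struct vnode *dvp;
        *
        *      while (cache_revlookup(vp, &dvp, &bp, buf, false, 0) == 0) {
        *              // bp now points at vp's name within dvp
        *              vp = dvp;               // continue towards the root
        *      }
        */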
  881 
  882 /*
  883  * Add an entry to the cache.
  884  */
  885 void
  886 cache_enter(struct vnode *dvp, struct vnode *vp,
  887             const char *name, size_t namelen, uint32_t cnflags)
  888 {
  889         vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
  890         struct namecache *ncp, *oncp;
  891         int total;
  892 
  893         KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);
  894 
  895         /* First, check whether we can/should add a cache entry. */
  896         if ((cnflags & MAKEENTRY) == 0 ||
  897             __predict_false(namelen > cache_maxlen)) {
  898                 SDT_PROBE(vfs, namecache, enter, toolong, vp, name, namelen,
  899                     0, 0);
  900                 return;
  901         }
  902 
  903         SDT_PROBE(vfs, namecache, enter, done, vp, name, namelen, 0, 0);
  904 
  905         /*
  906          * Reclaim some entries if over budget.  This is an unlocked check,
  907          * but it doesn't matter.  Just need to catch up with things
  908          * eventually: it doesn't matter if we go over temporarily.
  909          */
  910         total = atomic_load_relaxed(&cache_lru.count[LRU_ACTIVE]);
  911         total += atomic_load_relaxed(&cache_lru.count[LRU_INACTIVE]);
  912         if (__predict_false(total > desiredvnodes)) {
  913                 cache_reclaim();
  914         }
  915 
  916         /* Now allocate a fresh entry. */
  917         if (__predict_true(namelen <= NCHNAMLEN)) {
  918                 ncp = pool_cache_get(cache_pool, PR_WAITOK);
  919         } else {
  920                 size_t sz = offsetof(struct namecache, nc_name[namelen]);
  921                 ncp = kmem_alloc(sz, KM_SLEEP);
  922         }
  923 
  924         /*
  925          * Fill in cache info.  For negative hits, save the ISWHITEOUT flag
  926          * so we can restore it later when the cache entry is used again.
  927          */
  928         ncp->nc_vp = vp;
  929         ncp->nc_dvp = dvp;
  930         ncp->nc_key = cache_key(name, namelen);
  931         ncp->nc_nlen = namelen;
  932         ncp->nc_whiteout = ((cnflags & ISWHITEOUT) != 0);
  933         memcpy(ncp->nc_name, name, namelen);
  934 
  935         /*
  936          * Insert to the directory.  Concurrent lookups may race for a cache
   937  * entry.  If there's an entry there already, purge it.
  938          */
  939         rw_enter(&dvi->vi_nc_lock, RW_WRITER);
  940         oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
  941         if (oncp != ncp) {
  942                 KASSERT(oncp->nc_key == ncp->nc_key);
  943                 KASSERT(oncp->nc_nlen == ncp->nc_nlen);
  944                 KASSERT(memcmp(oncp->nc_name, name, namelen) == 0);
  945                 cache_remove(oncp, true);
  946                 oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
  947                 KASSERT(oncp == ncp);
  948         }
  949 
  950         /*
  951          * With the directory lock still held, insert to the tail of the
  952          * ACTIVE LRU list (new) and take the opportunity to incrementally
  953          * balance the lists.
  954          */
  955         mutex_enter(&cache_lru_lock);
  956         ncp->nc_lrulist = LRU_ACTIVE;
  957         cache_lru.count[LRU_ACTIVE]++;
  958         TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
  959         cache_deactivate();
  960         mutex_exit(&cache_lru_lock);
  961 
  962         /*
  963          * Finally, insert to the vnode and unlock.  With everything set up
  964          * it's safe to let cache_revlookup() see the entry.  Partially sort
  965          * the per-vnode list: dots go to back so cache_revlookup() doesn't
  966          * have to consider them.
  967          */
  968         if (vp != NULL) {
  969                 vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
  970                 rw_enter(&vi->vi_nc_listlock, RW_WRITER);
  971                 if ((namelen == 1 && name[0] == '.') ||
  972                     (namelen == 2 && name[0] == '.' && name[1] == '.')) {
  973                         TAILQ_INSERT_TAIL(&vi->vi_nc_list, ncp, nc_list);
  974                 } else {
  975                         TAILQ_INSERT_HEAD(&vi->vi_nc_list, ncp, nc_list);
  976                 }
  977                 rw_exit(&vi->vi_nc_listlock);
  978         }
  979         rw_exit(&dvi->vi_nc_lock);
  980 }
  981 
  982 /*
  983  * Set identity info in cache for a vnode.  We only care about directories
  984  * so ignore other updates.  The cached info may be marked invalid if the
  985  * inode has an ACL.
  986  */
  987 void
  988 cache_enter_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid, bool valid)
  989 {
  990         vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
  991 
  992         if (vp->v_type == VDIR) {
  993                 /* Grab both locks, for forward & reverse lookup. */
  994                 rw_enter(&vi->vi_nc_lock, RW_WRITER);
  995                 rw_enter(&vi->vi_nc_listlock, RW_WRITER);
  996                 if (valid) {
  997                         vi->vi_nc_mode = mode;
  998                         vi->vi_nc_uid = uid;
  999                         vi->vi_nc_gid = gid;
 1000                 } else {
 1001                         vi->vi_nc_mode = VNOVAL;
 1002                         vi->vi_nc_uid = VNOVAL;
 1003                         vi->vi_nc_gid = VNOVAL;
 1004                 }
 1005                 rw_exit(&vi->vi_nc_listlock);
 1006                 rw_exit(&vi->vi_nc_lock);
 1007         }
 1008 }
 1009 
 1010 /*
 1011  * Return true if we have identity for the given vnode, and use as an
 1012  * opportunity to confirm that everything squares up.
 1013  *
 1014  * Because of shared code, some file systems could provide partial
 1015  * information, missing some updates, so check the mount flag too.
 1016  */
 1017 bool
 1018 cache_have_id(struct vnode *vp)
 1019 {
 1020 
 1021         if (vp->v_type == VDIR &&
 1022             (vp->v_mount->mnt_iflag & IMNT_NCLOOKUP) != 0 &&
 1023             atomic_load_relaxed(&VNODE_TO_VIMPL(vp)->vi_nc_mode) != VNOVAL) {
 1024                 return true;
 1025         } else {
 1026                 return false;
 1027         }
 1028 }
 1029 
 1030 /*
 1031  * Enter a mount point.  cvp is the covered vnode, and rvp is the root of
 1032  * the mounted file system.
 1033  */
 1034 void
 1035 cache_enter_mount(struct vnode *cvp, struct vnode *rvp)
 1036 {
 1037 
 1038         KASSERT(vrefcnt(cvp) > 0);
 1039         KASSERT(vrefcnt(rvp) > 0);
 1040         KASSERT(cvp->v_type == VDIR);
 1041         KASSERT((rvp->v_vflag & VV_ROOT) != 0);
 1042 
 1043         if (rvp->v_type == VDIR) {
 1044                 cache_enter(cvp, rvp, cache_mp_name, cache_mp_nlen, MAKEENTRY);
 1045         }
 1046 }
 1047 
 1048 /*
 1049  * Look up a cached mount point.  Used in the strongly locked path.
 1050  */
 1051 bool
 1052 cache_lookup_mount(struct vnode *dvp, struct vnode **vn_ret)
 1053 {
 1054         bool ret;
 1055 
 1056         ret = cache_lookup(dvp, cache_mp_name, cache_mp_nlen, LOOKUP,
 1057             MAKEENTRY, NULL, vn_ret);
 1058         KASSERT((*vn_ret != NULL) == ret);
 1059         return ret;
 1060 }
 1061 
 1062 /*
 1063  * Try to cross a mount point.  For use with cache_lookup_linked().
 1064  */
 1065 bool
 1066 cache_cross_mount(struct vnode **dvp, krwlock_t **plock)
 1067 {
 1068 
 1069         return cache_lookup_linked(*dvp, cache_mp_name, cache_mp_nlen,
 1070            dvp, plock, FSCRED);
 1071 }
 1072 
 1073 /*
 1074  * Name cache initialization, from vfs_init() when the system is booting.
 1075  */
 1076 void
 1077 nchinit(void)
 1078 {
 1079 
 1080         cache_pool = pool_cache_init(sizeof(struct namecache),
 1081             coherency_unit, 0, 0, "namecache", NULL, IPL_NONE, NULL,
 1082             NULL, NULL);
 1083         KASSERT(cache_pool != NULL);
 1084 
 1085         mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE);
 1086         TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]);
 1087         TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]);
 1088 
 1089         mutex_init(&cache_stat_lock, MUTEX_DEFAULT, IPL_NONE);
 1090         callout_init(&cache_stat_callout, CALLOUT_MPSAFE);
 1091         callout_setfunc(&cache_stat_callout, cache_update_stats, NULL);
 1092         callout_schedule(&cache_stat_callout, cache_stat_interval * hz);
 1093 
 1094         KASSERT(cache_sysctllog == NULL);
 1095         sysctl_createv(&cache_sysctllog, 0, NULL, NULL,
 1096                        CTLFLAG_PERMANENT,
 1097                        CTLTYPE_STRUCT, "namecache_stats",
 1098                        SYSCTL_DESCR("namecache statistics"),
 1099                        cache_stat_sysctl, 0, NULL, 0,
 1100                        CTL_VFS, CTL_CREATE, CTL_EOL);
 1101 }
 1102 
 1103 /*
 1104  * Called once for each CPU in the system as attached.
 1105  */
 1106 void
 1107 cache_cpu_init(struct cpu_info *ci)
 1108 {
 1109         void *p;
 1110         size_t sz;
 1111 
 1112         sz = roundup2(sizeof(struct nchcpu), coherency_unit) + coherency_unit;
 1113         p = kmem_zalloc(sz, KM_SLEEP);
 1114         ci->ci_data.cpu_nch = (void *)roundup2((uintptr_t)p, coherency_unit);
 1115 }
 1116 
 1117 /*
 1118  * A vnode is being allocated: set up cache structures.
 1119  */
 1120 void
 1121 cache_vnode_init(struct vnode *vp)
 1122 {
 1123         vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
 1124 
 1125         rw_init(&vi->vi_nc_lock);
 1126         rw_init(&vi->vi_nc_listlock);
 1127         rb_tree_init(&vi->vi_nc_tree, &cache_rbtree_ops);
 1128         TAILQ_INIT(&vi->vi_nc_list);
 1129         vi->vi_nc_mode = VNOVAL;
 1130         vi->vi_nc_uid = VNOVAL;
 1131         vi->vi_nc_gid = VNOVAL;
 1132 }
 1133 
 1134 /*
 1135  * A vnode is being freed: finish cache structures.
 1136  */
 1137 void
 1138 cache_vnode_fini(struct vnode *vp)
 1139 {
 1140         vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
 1141 
 1142         KASSERT(RB_TREE_MIN(&vi->vi_nc_tree) == NULL);
 1143         KASSERT(TAILQ_EMPTY(&vi->vi_nc_list));
 1144         rw_destroy(&vi->vi_nc_lock);
 1145         rw_destroy(&vi->vi_nc_listlock);
 1146 }
 1147 
 1148 /*
 1149  * Helper for cache_purge1(): purge cache entries for the given vnode from
 1150  * all directories that the vnode is cached in.
 1151  */
 1152 static void
 1153 cache_purge_parents(struct vnode *vp)
 1154 {
 1155         vnode_impl_t *dvi, *vi = VNODE_TO_VIMPL(vp);
 1156         struct vnode *dvp, *blocked;
 1157         struct namecache *ncp;
 1158 
 1159         SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0);
 1160 
 1161         blocked = NULL;
 1162 
 1163         rw_enter(&vi->vi_nc_listlock, RW_WRITER);
 1164         while ((ncp = TAILQ_FIRST(&vi->vi_nc_list)) != NULL) {
 1165                 /*
 1166                  * Locking in the wrong direction.  Try for a hold on the
 1167                  * directory node's lock, and if we get it then all good,
 1168                  * nuke the entry and move on to the next.
 1169                  */
 1170                 dvp = ncp->nc_dvp;
 1171                 dvi = VNODE_TO_VIMPL(dvp);
 1172                 if (rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) {
 1173                         cache_remove(ncp, false);
 1174                         rw_exit(&dvi->vi_nc_lock);
 1175                         blocked = NULL;
 1176                         continue;
 1177                 }
 1178 
 1179                 /*
 1180                  * We can't wait on the directory node's lock with our list
 1181                  * lock held or the system could deadlock.
 1182                  *
 1183                  * Take a hold on the directory vnode to prevent it from
 1184                  * being freed (taking the vnode & lock with it).  Then
 1185                  * wait for the lock to become available with no other locks
 1186                  * held, and retry.
 1187                  *
 1188                  * If this happens twice in a row, give the other side a
 1189                  * breather; we can do nothing until it lets go.
 1190                  */
 1191                 vhold(dvp);
 1192                 rw_exit(&vi->vi_nc_listlock);
 1193                 rw_enter(&dvi->vi_nc_lock, RW_WRITER);
 1194                 /* Do nothing. */
 1195                 rw_exit(&dvi->vi_nc_lock);
 1196                 holdrele(dvp);
 1197                 if (blocked == dvp) {
 1198                         kpause("ncpurge", false, 1, NULL);
 1199                 }
 1200                 rw_enter(&vi->vi_nc_listlock, RW_WRITER);
 1201                 blocked = dvp;
 1202         }
 1203         rw_exit(&vi->vi_nc_listlock);
 1204 }
 1205 
 1206 /*
 1207  * Helper for cache_purge1(): purge all cache entries hanging off the given
 1208  * directory vnode.
 1209  */
 1210 static void
 1211 cache_purge_children(struct vnode *dvp)
 1212 {
 1213         vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
 1214         struct namecache *ncp;
 1215 
 1216         SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0);
 1217 
 1218         rw_enter(&dvi->vi_nc_lock, RW_WRITER);
 1219         while ((ncp = RB_TREE_MIN(&dvi->vi_nc_tree)) != NULL) {
 1220                 cache_remove(ncp, true);
 1221         }
 1222         rw_exit(&dvi->vi_nc_lock);
 1223 }
 1224 
 1225 /*
 1226  * Helper for cache_purge1(): purge cache entry from the given vnode,
 1227  * finding it by name.
 1228  */
 1229 static void
 1230 cache_purge_name(struct vnode *dvp, const char *name, size_t namelen)
 1231 {
 1232         vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
 1233         struct namecache *ncp;
 1234         uint64_t key;
 1235 
 1236         SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0);
 1237 
 1238         key = cache_key(name, namelen);
 1239         rw_enter(&dvi->vi_nc_lock, RW_WRITER);
 1240         ncp = cache_lookup_entry(dvp, name, namelen, key);
 1241         if (ncp) {
 1242                 cache_remove(ncp, true);
 1243         }
 1244         rw_exit(&dvi->vi_nc_lock);
 1245 }
 1246 
 1247 /*
 1248  * Cache flush, a particular vnode; called when a vnode is renamed to
 1249  * hide entries that would now be invalid.
 1250  */
 1251 void
 1252 cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags)
 1253 {
 1254 
 1255         if (flags & PURGE_PARENTS) {
 1256                 cache_purge_parents(vp);
 1257         }
 1258         if (flags & PURGE_CHILDREN) {
 1259                 cache_purge_children(vp);
 1260         }
 1261         if (name != NULL) {
 1262                 cache_purge_name(vp, name, namelen);
 1263         }
 1264 }
 1265 
 1266 /*
 1267  * vnode filter for cache_purgevfs().
 1268  */
 1269 static bool
 1270 cache_vdir_filter(void *cookie, vnode_t *vp)
 1271 {
 1272 
 1273         return vp->v_type == VDIR;
 1274 }
 1275 
 1276 /*
 1277  * Cache flush, a whole filesystem; called when filesys is umounted to
 1278  * remove entries that would now be invalid.
 1279  */
 1280 void
 1281 cache_purgevfs(struct mount *mp)
 1282 {
 1283         struct vnode_iterator *iter;
 1284         vnode_t *dvp;
 1285 
 1286         vfs_vnode_iterator_init(mp, &iter);
 1287         for (;;) {
 1288                 dvp = vfs_vnode_iterator_next(iter, cache_vdir_filter, NULL);
 1289                 if (dvp == NULL) {
 1290                         break;
 1291                 }
 1292                 cache_purge_children(dvp);
 1293                 vrele(dvp);
 1294         }
 1295         vfs_vnode_iterator_destroy(iter);
 1296 }
 1297 
 1298 /*
 1299  * Re-queue an entry onto the tail of the active LRU list, after it has
 1300  * scored a hit.
 1301  */
 1302 static void
 1303 cache_activate(struct namecache *ncp)
 1304 {
 1305 
 1306         mutex_enter(&cache_lru_lock);
 1307         TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
 1308         TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
 1309         cache_lru.count[ncp->nc_lrulist]--;
 1310         cache_lru.count[LRU_ACTIVE]++;
 1311         ncp->nc_lrulist = LRU_ACTIVE;
 1312         mutex_exit(&cache_lru_lock);
 1313 }
 1314 
 1315 /*
 1316  * Try to balance the LRU lists.  Pick some victim entries, and re-queue
 1317  * them from the head of the active list to the tail of the inactive list.
 1318  */
 1319 static void
 1320 cache_deactivate(void)
 1321 {
 1322         struct namecache *ncp;
 1323         int total, i;
 1324 
 1325         KASSERT(mutex_owned(&cache_lru_lock));
 1326 
 1327         /* If we're nowhere near budget yet, don't bother. */
 1328         total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE];
 1329         if (total < (desiredvnodes >> 1)) {
 1330                 return;
 1331         }
 1332 
 1333         /*
 1334          * Aim for a 1:1 ratio of active to inactive.  This is to allow each
 1335          * potential victim a reasonable amount of time to cycle through the
 1336          * inactive list in order to score a hit and be reactivated, while
 1337          * trying not to cause reactivations too frequently.
 1338          */
 1339         if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) {
 1340                 return;
 1341         }
 1342 
 1343         /* Move only a few at a time; will catch up eventually. */
 1344         for (i = 0; i < cache_lru_maxdeact; i++) {
 1345                 ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]);
 1346                 if (ncp == NULL) {
 1347                         break;
 1348                 }
 1349                 KASSERT(ncp->nc_lrulist == LRU_ACTIVE);
 1350                 ncp->nc_lrulist = LRU_INACTIVE;
 1351                 TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
 1352                 TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru);
 1353                 cache_lru.count[LRU_ACTIVE]--;
 1354                 cache_lru.count[LRU_INACTIVE]++;
 1355         }
 1356 }
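
/*
 * [Editorial sketch -- not part of vfs_cache.c.]  A self-contained userspace
 * model of the two-list LRU scheme used by cache_activate() and
 * cache_deactivate() above, built on the same <sys/queue.h> TAILQ macros:
 * a hit moves an entry to the tail of the active list, and the balancer
 * demotes entries from the head of the active list (the coldest) to the
 * tail of the inactive list while keeping the two lists at roughly a 1:1
 * ratio.  The kernel version also checks an overall budget and holds
 * cache_lru_lock; both are omitted here.  All toy_* names are hypothetical.
 */
#include <sys/queue.h>
#include <stdio.h>

enum { TOY_ACTIVE, TOY_INACTIVE, TOY_NLISTS };

struct toy_entry {
        TAILQ_ENTRY(toy_entry)  te_lru;
        int                     te_list;        /* which list it is on */
        int                     te_id;
};

static TAILQ_HEAD(, toy_entry) toy_list[TOY_NLISTS];
static int toy_count[TOY_NLISTS];

/* Re-queue an entry onto the tail of the active list after a hit. */
static void
toy_activate(struct toy_entry *te)
{

        TAILQ_REMOVE(&toy_list[te->te_list], te, te_lru);
        TAILQ_INSERT_TAIL(&toy_list[TOY_ACTIVE], te, te_lru);
        toy_count[te->te_list]--;
        toy_count[TOY_ACTIVE]++;
        te->te_list = TOY_ACTIVE;
}

/* Demote a few of the coldest active entries, aiming for a 1:1 ratio. */
static void
toy_deactivate(int maxdeact)
{
        struct toy_entry *te;
        int i;

        if (toy_count[TOY_ACTIVE] < toy_count[TOY_INACTIVE]) {
                return;
        }
        for (i = 0; i < maxdeact; i++) {
                te = TAILQ_FIRST(&toy_list[TOY_ACTIVE]);
                if (te == NULL) {
                        break;
                }
                TAILQ_REMOVE(&toy_list[TOY_ACTIVE], te, te_lru);
                TAILQ_INSERT_TAIL(&toy_list[TOY_INACTIVE], te, te_lru);
                toy_count[TOY_ACTIVE]--;
                toy_count[TOY_INACTIVE]++;
                te->te_list = TOY_INACTIVE;
        }
}

int
main(void)
{
        struct toy_entry e[4];
        int i;

        for (i = 0; i < TOY_NLISTS; i++) {
                TAILQ_INIT(&toy_list[i]);
        }
        for (i = 0; i < 4; i++) {
                e[i].te_id = i;
                e[i].te_list = TOY_INACTIVE;
                TAILQ_INSERT_TAIL(&toy_list[TOY_INACTIVE], &e[i], te_lru);
                toy_count[TOY_INACTIVE]++;
        }
        toy_activate(&e[0]);            /* entries 0..2 score hits */
        toy_activate(&e[1]);
        toy_activate(&e[2]);
        toy_deactivate(1);              /* demote at most one entry */
        printf("active=%d inactive=%d\n",
            toy_count[TOY_ACTIVE], toy_count[TOY_INACTIVE]);
        return 0;
}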
 1357 
 1358 /*
 1359  * Free some entries from the cache when we have gone over budget.
 1360  *
 1361  * We don't want to cause too much work for any individual caller, and it
 1362  * doesn't matter if we temporarily go over budget.  This is also "just a
 1363  * cache", so it's not a big deal if we occasionally evict an entry that
 1364  * should have been kept.  We therefore take a relaxed approach to keep
 1365  * the impact of this process low.
 1366  */
 1367 static void
 1368 cache_reclaim(void)
 1369 {
 1370         struct namecache *ncp;
 1371         vnode_impl_t *dvi;
 1372         int toscan;
 1373 
 1374         /*
 1375          * Scan up to a preset maximum number of entries, but no more than
 1376          * 0.8% of the total at once (to allow for very small systems).
 1377          *
 1378          * On bigger systems, do a larger chunk of work to reduce the number
 1379          * of times that cache_lru_lock is held for any length of time.
 1380          */
 1381         mutex_enter(&cache_lru_lock);
 1382         toscan = MIN(cache_lru_maxscan, desiredvnodes >> 7);
 1383         toscan = MAX(toscan, 1);
 1384         SDT_PROBE(vfs, namecache, prune, done, cache_lru.count[LRU_ACTIVE] +
 1385             cache_lru.count[LRU_INACTIVE], toscan, 0, 0, 0);
 1386         while (toscan-- != 0) {
 1387                 /* First try to balance the lists. */
 1388                 cache_deactivate();
 1389 
 1390                 /* Now look for a victim on head of inactive list (old). */
 1391                 ncp = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]);
 1392                 if (ncp == NULL) {
 1393                         break;
 1394                 }
 1395                 dvi = VNODE_TO_VIMPL(ncp->nc_dvp);
 1396                 KASSERT(ncp->nc_lrulist == LRU_INACTIVE);
 1397                 KASSERT(dvi != NULL);
 1398 
 1399                 /*
 1400                  * Locking in the wrong direction.  If we can't get the
 1401                  * lock, the directory is actively busy and would likely
 1402                  * stall the next reclaimer here as well, so send the
 1403                  * entry to the back of the list and move on.
 1404                  */
 1405                 if (!rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) {
 1406                         TAILQ_REMOVE(&cache_lru.list[LRU_INACTIVE],
 1407                             ncp, nc_lru);
 1408                         TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE],
 1409                             ncp, nc_lru);
 1410                         continue;
 1411                 }
 1412 
 1413                 /*
 1414                  * Now have the victim entry locked.  Drop the LRU list
 1415                  * lock, purge the entry, and start over.  The hold on
 1416                  * vi_nc_lock will prevent the vnode from vanishing until
 1417                  * finished (cache_purge() will be called on dvp before it
 1418                  * disappears, and that will wait on vi_nc_lock).
 1419                  */
 1420                 mutex_exit(&cache_lru_lock);
 1421                 cache_remove(ncp, true);
 1422                 rw_exit(&dvi->vi_nc_lock);
 1423                 mutex_enter(&cache_lru_lock);
 1424         }
 1425         mutex_exit(&cache_lru_lock);
 1426 }
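
/*
 * [Editorial sketch -- not part of vfs_cache.c.]  A self-contained userspace
 * illustration of the "locking in the wrong direction" back-off used by
 * cache_reclaim() above: rather than block on a per-directory lock that it
 * would normally take in the opposite order, the reclaimer only trylocks,
 * and on failure it re-queues the victim at the tail of the list and moves
 * on.  The toy_* names are hypothetical and error handling is minimal.
 */
#include <pthread.h>
#include <stdio.h>

struct toy_dir {
        pthread_mutex_t td_lock;        /* stands in for vi_nc_lock */
        int             td_id;
};

/* Try to reclaim one entry belonging to "dir"; return 1 on success. */
static int
toy_reclaim_one(struct toy_dir *dir)
{

        if (pthread_mutex_trylock(&dir->td_lock) != 0) {
                /* Busy: the caller should re-queue this victim and move on. */
                printf("dir %d busy, skipping\n", dir->td_id);
                return 0;
        }
        printf("dir %d: entry removed\n", dir->td_id);
        pthread_mutex_unlock(&dir->td_lock);
        return 1;
}

int
main(void)
{
        struct toy_dir busy = { PTHREAD_MUTEX_INITIALIZER, 1 };
        struct toy_dir idle = { PTHREAD_MUTEX_INITIALIZER, 2 };

        pthread_mutex_lock(&busy.td_lock);      /* simulate an active lookup */
        toy_reclaim_one(&busy);                 /* backs off */
        toy_reclaim_one(&idle);                 /* succeeds */
        pthread_mutex_unlock(&busy.td_lock);
        return 0;
}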
 1427 
 1428 /*
 1429  * For file system code: count a lookup that required a full re-scan of
 1430  * directory metadata.
 1431  */
 1432 void
 1433 namecache_count_pass2(void)
 1434 {
 1435 
 1436         COUNT(ncs_pass2);
 1437 }
 1438 
 1439 /*
 1440  * For file system code: count a lookup that scored a hit in the directory
 1441  * metadata near the location of the last lookup.
 1442  */
 1443 void
 1444 namecache_count_2passes(void)
 1445 {
 1446 
 1447         COUNT(ncs_2passes);
 1448 }
 1449 
 1450 /*
 1451  * Sum the stats from all CPUs into nchstats.  This needs to run at least
 1452  * once within every window where a 32-bit counter could roll over.  It is
 1453  * called regularly from a timer to ensure this.
 1454  */
 1455 static void
 1456 cache_update_stats(void *cookie)
 1457 {
 1458         CPU_INFO_ITERATOR cii;
 1459         struct cpu_info *ci;
 1460 
 1461         mutex_enter(&cache_stat_lock);
 1462         for (CPU_INFO_FOREACH(cii, ci)) {
 1463                 struct nchcpu *nchcpu = ci->ci_data.cpu_nch;
 1464                 UPDATE(nchcpu, ncs_goodhits);
 1465                 UPDATE(nchcpu, ncs_neghits);
 1466                 UPDATE(nchcpu, ncs_badhits);
 1467                 UPDATE(nchcpu, ncs_falsehits);
 1468                 UPDATE(nchcpu, ncs_miss);
 1469                 UPDATE(nchcpu, ncs_long);
 1470                 UPDATE(nchcpu, ncs_pass2);
 1471                 UPDATE(nchcpu, ncs_2passes);
 1472                 UPDATE(nchcpu, ncs_revhits);
 1473                 UPDATE(nchcpu, ncs_revmiss);
 1474                 UPDATE(nchcpu, ncs_denied);
 1475         }
 1476         if (cookie != NULL) {
 1477                 memcpy(cookie, &nchstats, sizeof(nchstats));
 1478         }
 1479         /* Reset the timer; arrive back here in N minutes at the latest. */
 1480         callout_schedule(&cache_stat_callout, cache_stat_interval * hz);
 1481         mutex_exit(&cache_stat_lock);
 1482 }
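
/*
 * [Editorial sketch -- not part of vfs_cache.c.]  A self-contained userspace
 * model of the statistics scheme behind cache_update_stats() above: the hot
 * path bumps cheap per-CPU counters with no shared lock, and a periodic
 * sweep folds them into the global totals under a single lock.  The toy
 * simply zeroes the per-CPU values after folding, a simplification of the
 * kernel's delta tracking for wrap-prone 32-bit counters.  The toy_* names
 * and the fixed number of "CPUs" are hypothetical.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_NCPU        4

struct toy_stats {
        uint64_t        ts_goodhits;
        uint64_t        ts_miss;
};

static struct toy_stats toy_percpu[TOY_NCPU];   /* written lock-free, one per CPU */
static struct toy_stats toy_global;             /* protected by toy_stat_lock */
static pthread_mutex_t toy_stat_lock = PTHREAD_MUTEX_INITIALIZER;

/* Fold every per-CPU counter into the global totals, then reset it. */
static void
toy_update_stats(void)
{
        int cpu;

        pthread_mutex_lock(&toy_stat_lock);
        for (cpu = 0; cpu < TOY_NCPU; cpu++) {
                toy_global.ts_goodhits += toy_percpu[cpu].ts_goodhits;
                toy_global.ts_miss += toy_percpu[cpu].ts_miss;
                toy_percpu[cpu].ts_goodhits = 0;
                toy_percpu[cpu].ts_miss = 0;
        }
        pthread_mutex_unlock(&toy_stat_lock);
}

int
main(void)
{

        toy_percpu[0].ts_goodhits = 10;         /* pretend CPU 0 saw 10 hits */
        toy_percpu[3].ts_miss = 2;              /* and CPU 3 saw 2 misses */
        toy_update_stats();
        printf("goodhits=%llu miss=%llu\n",
            (unsigned long long)toy_global.ts_goodhits,
            (unsigned long long)toy_global.ts_miss);
        return 0;
}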
 1483 
 1484 /*
 1485  * Fetch the current values of the stats for sysctl.
 1486  */
 1487 static int
 1488 cache_stat_sysctl(SYSCTLFN_ARGS)
 1489 {
 1490         struct nchstats stats;
 1491 
 1492         if (oldp == NULL) {
 1493                 *oldlenp = sizeof(nchstats);
 1494                 return 0;
 1495         }
 1496 
 1497         if (*oldlenp <= 0) {
 1498                 *oldlenp = 0;
 1499                 return 0;
 1500         }
 1501 
 1502         /* Refresh the global stats. */
 1503         sysctl_unlock();
 1504         cache_update_stats(&stats);
 1505         sysctl_relock();
 1506 
 1507         *oldlenp = MIN(sizeof(stats), *oldlenp);
 1508         return sysctl_copyout(l, &stats, oldp, *oldlenp);
 1509 }
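
/*
 * [Editorial sketch -- not part of vfs_cache.c.]  A self-contained userspace
 * illustration of the size-probe protocol that cache_stat_sysctl() above
 * follows: a NULL output buffer is a request for the required size, a zero
 * length copies nothing, and otherwise at most MIN(caller length, object
 * size) bytes are copied back along with the length actually used.
 * toy_fetch_stats() and its callers are hypothetical; the real function
 * copies out to user space via sysctl_copyout().
 */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct toy_stats {
        unsigned long   ts_goodhits;
        unsigned long   ts_miss;
};

static struct toy_stats toy_current = { 123, 7 };

#define TOY_MIN(a, b)   ((a) < (b) ? (a) : (b))

static int
toy_fetch_stats(void *oldp, size_t *oldlenp)
{
        struct toy_stats snap;

        if (oldp == NULL) {
                *oldlenp = sizeof(toy_current); /* size probe */
                return 0;
        }
        if (*oldlenp == 0) {
                return 0;
        }
        snap = toy_current;                     /* consistent snapshot */
        *oldlenp = TOY_MIN(sizeof(snap), *oldlenp);
        memcpy(oldp, &snap, *oldlenp);
        return 0;
}

int
main(void)
{
        struct toy_stats out;
        size_t len = 0;

        toy_fetch_stats(NULL, &len);            /* learn the size */
        printf("need %zu bytes\n", len);
        toy_fetch_stats(&out, &len);            /* then fetch */
        printf("goodhits=%lu miss=%lu\n", out.ts_goodhits, out.ts_miss);
        return 0;
}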
 1510 
 1511 /*
 1512  * For the debugger, given the address of a vnode, print all associated
 1513  * names in the cache.
 1514  */
 1515 #ifdef DDB
 1516 void
 1517 namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
 1518 {
 1519         struct vnode *dvp = NULL;
 1520         struct namecache *ncp;
 1521         enum cache_lru_id id;
 1522 
 1523         for (id = 0; id < LRU_COUNT; id++) {
 1524                 TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
 1525                         if (ncp->nc_vp == vp) {
 1526                                 (*pr)("name %.*s\n", ncp->nc_nlen,
 1527                                     ncp->nc_name);
 1528                                 dvp = ncp->nc_dvp;
 1529                         }
 1530                 }
 1531         }
 1532         if (dvp == NULL) {
 1533                 (*pr)("name not found\n");
 1534                 return;
 1535         }
 1536         for (id = 0; id < LRU_COUNT; id++) {
 1537                 TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
 1538                         if (ncp->nc_vp == dvp) {
 1539                                 (*pr)("parent %.*s\n", ncp->nc_nlen,
 1540                                     ncp->nc_name);
 1541                         }
 1542                 }
 1543         }
 1544 }
 1545 #endif
