FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_cache.c


    1 /*
    2  * Copyright (c) 2003,2004,2009 The DragonFly Project.  All rights reserved.
    3  * 
    4  * This code is derived from software contributed to The DragonFly Project
    5  * by Matthew Dillon <dillon@backplane.com>
    6  * 
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in
   15  *    the documentation and/or other materials provided with the
   16  *    distribution.
   17  * 3. Neither the name of The DragonFly Project nor the names of its
   18  *    contributors may be used to endorse or promote products derived
   19  *    from this software without specific, prior written permission.
   20  * 
   21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
   25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  * 
   34  * Copyright (c) 1989, 1993, 1995
   35  *      The Regents of the University of California.  All rights reserved.
   36  *
   37  * This code is derived from software contributed to Berkeley by
   38  * Poul-Henning Kamp of the FreeBSD Project.
   39  *
   40  * Redistribution and use in source and binary forms, with or without
   41  * modification, are permitted provided that the following conditions
   42  * are met:
   43  * 1. Redistributions of source code must retain the above copyright
   44  *    notice, this list of conditions and the following disclaimer.
   45  * 2. Redistributions in binary form must reproduce the above copyright
   46  *    notice, this list of conditions and the following disclaimer in the
   47  *    documentation and/or other materials provided with the distribution.
   48  * 3. Neither the name of the University nor the names of its contributors
   49  *    may be used to endorse or promote products derived from this software
   50  *    without specific prior written permission.
   51  *
   52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   62  * SUCH DAMAGE.
   63  */
   64 
   65 #include <sys/param.h>
   66 #include <sys/systm.h>
   67 #include <sys/kernel.h>
   68 #include <sys/sysctl.h>
   69 #include <sys/mount.h>
   70 #include <sys/vnode.h>
   71 #include <sys/malloc.h>
   72 #include <sys/sysproto.h>
   73 #include <sys/spinlock.h>
   74 #include <sys/proc.h>
   75 #include <sys/namei.h>
   76 #include <sys/nlookup.h>
   77 #include <sys/filedesc.h>
   78 #include <sys/fnv_hash.h>
   79 #include <sys/globaldata.h>
   80 #include <sys/kern_syscall.h>
   81 #include <sys/dirent.h>
   82 #include <ddb/ddb.h>
   83 
   84 #include <sys/sysref2.h>
   85 #include <sys/spinlock2.h>
   86 #include <sys/mplock2.h>
   87 
   88 #define MAX_RECURSION_DEPTH     64
   89 
   90 /*
   91  * Random lookups in the cache are accomplished with a hash table using
   92  * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock.
   93  *
   94  * Negative entries may exist and correspond to resolved namecache
   95  * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
   96  * will be set if the entry corresponds to a whited-out directory entry
   97  * (versus simply not finding the entry at all).   ncneglist is locked
   98  * with a global spinlock (ncspin).
   99  *
  100  * MPSAFE RULES:
  101  *
  102  * (1) A ncp must be referenced before it can be locked.
  103  *
  104  * (2) A ncp must be locked in order to modify it.
  105  *
  106  * (3) ncp locks are always ordered child -> parent.  That may seem
  107  *     backwards but forward scans use the hash table and thus can hold
  108  *     the parent unlocked when traversing downward.
  109  *
  110  *     This allows insert/rename/delete/dot-dot and other operations
  111  *     to use ncp->nc_parent links.
  112  *
  113  *     This also prevents a locked-up node (e.g. an NFS node) from creating
  114  *     a chain reaction all the way back to the root vnode / namecache.
  115  *
  116  * (4) parent linkages require both the parent and child to be locked.
  117  */
  118 
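/*
 * Illustrative sketch (not part of the original file) of the MPSAFE rules
 * above in practice.  The example_ name is hypothetical; the helpers it
 * uses (_cache_hold, _cache_lock, etc.) are defined later in this file.
 * An entry is referenced before it is locked, and a child is always
 * locked before its parent.
 */
#if 0
static void
example_lock_child_then_parent(struct namecache *child)
{
        struct namecache *par;

        _cache_hold(child);             /* rule (1): ref before locking */
        _cache_lock(child);             /* rule (2): lock before modifying */
        if ((par = child->nc_parent) != NULL) {
                _cache_hold(par);
                _cache_lock(par);       /* rule (3): child -> parent order */
                /* rule (4): both locked, parent linkage may be changed */
                _cache_unlock(par);
                _cache_drop(par);
        }
        _cache_unlock(child);
        _cache_drop(child);
}
#endif
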
  119 /*
  120  * Structures associated with name caching.
  121  */
  122 #define NCHHASH(hash)           (&nchashtbl[(hash) & nchash])
  123 #define MINNEG                  1024
  124 #define MINPOS                  1024
  125 #define NCMOUNT_NUMCACHE        1009    /* prime number */
  126 
  127 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
  128 
  129 LIST_HEAD(nchash_list, namecache);
  130 
  131 struct nchash_head {
  132        struct nchash_list list;
  133        struct spinlock  spin;
  134 };
  135 
  136 struct ncmount_cache {
  137         struct spinlock spin;
  138         struct namecache *ncp;
  139         struct mount *mp;
  140         int isneg;              /* if != 0 mp is originator and not target */
  141 };
  142 
  143 static struct nchash_head       *nchashtbl;
  144 static struct namecache_list    ncneglist;
  145 static struct spinlock          ncspin;
  146 static struct ncmount_cache     ncmount_cache[NCMOUNT_NUMCACHE];
  147 
  148 /*
  149  * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
  150  * to create the namecache infrastructure leading to a dangling vnode.
  151  *
  152  * 0    Only errors are reported
  153  * 1    Successes are reported
  154  * 2    Successes + the whole directory scan is reported
  155  * 3    Force the directory scan code to run as if the parent vnode did not
  156  *      have a namecache record, even if it does have one.
  157  */
  158 static int      ncvp_debug;
  159 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
  160     "Namecache debug level (0-3)");
  161 
  162 static u_long   nchash;                 /* size of hash table */
  163 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
  164     "Size of namecache hash table");
  165 
  166 static int      ncnegflush = 10;        /* burst for negative flush */
  167 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
  168     "Batch flush negative entries");
  169 
  170 static int      ncposflush = 10;        /* burst for positive flush */
  171 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
  172     "Batch flush positive entries");
  173 
  174 static int      ncnegfactor = 16;       /* ratio of negative entries */
  175 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
  176     "Ratio of namecache negative entries");
  177 
  178 static int      nclockwarn;             /* warn on locked entries in ticks */
  179 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
  180     "Warn on locked namecache entries in ticks");
  181 
  182 static int      numdefered;             /* number of deferred zaps */
  183 SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
  184     "Number of deferred namecache zaps");
  185 
  186 static int      ncposlimit;             /* limit on positive entries */
  187 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
  188     "Limit on the number of positive namecache entries");
  189 
  190 static int      ncp_shared_lock_disable = 0;
  191 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
  192            &ncp_shared_lock_disable, 0, "Disable shared namecache locks");
  193 
  194 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
  195     "sizeof(struct vnode)");
  196 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
  197     "sizeof(struct namecache)");
  198 
  199 static int      ncmount_cache_enable = 1;
  200 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
  201            &ncmount_cache_enable, 0, "mount point cache");
  202 static long     ncmount_cache_hit;
  203 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_hit, CTLFLAG_RW,
  204             &ncmount_cache_hit, 0, "mpcache hits");
  205 static long     ncmount_cache_miss;
  206 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_miss, CTLFLAG_RW,
  207             &ncmount_cache_miss, 0, "mpcache misses");
  208 static long     ncmount_cache_overwrite;
  209 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_overwrite, CTLFLAG_RW,
  210             &ncmount_cache_overwrite, 0, "mpcache entry overwrites");
  211 
  212 static int cache_resolve_mp(struct mount *mp);
  213 static struct vnode *cache_dvpref(struct namecache *ncp);
  214 static void _cache_lock(struct namecache *ncp);
  215 static void _cache_setunresolved(struct namecache *ncp);
  216 static void _cache_cleanneg(int count);
  217 static void _cache_cleanpos(int count);
  218 static void _cache_cleandefered(void);
  219 static void _cache_unlink(struct namecache *ncp);
  220 
  221 /*
  222  * The new name cache statistics
  223  */
  224 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
  225 static int numneg;
  226 SYSCTL_INT(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
  227     "Number of negative namecache entries");
  228 static int numcache;
  229 SYSCTL_INT(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
  230     "Number of namecache entries");
  231 static u_long numcalls;
  232 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcalls, CTLFLAG_RD, &numcalls, 0,
  233     "Number of namecache lookups");
  234 static u_long numchecks;
  235 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numchecks, CTLFLAG_RD, &numchecks, 0,
  236     "Number of checked entries in namecache lookups");
  237 
  238 struct nchstats nchstats[SMP_MAXCPU];
  239 /*
  240  * Export VFS cache effectiveness statistics to user-land.
  241  *
  242  * The statistics are left for aggregation to user-land so
  243  * neat things can be achieved, like observing per-CPU cache
  244  * distribution.
  245  */
  246 static int
  247 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
  248 {
  249         struct globaldata *gd;
  250         int i, error;
  251 
  252         error = 0;
  253         for (i = 0; i < ncpus; ++i) {
  254                 gd = globaldata_find(i);
  255                 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
  256                         sizeof(struct nchstats))))
  257                         break;
  258         }
  259 
  260         return (error);
  261 }
  262 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
  263   0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
  264 
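/*
 * Illustrative user-land sketch (an assumption, not part of this file):
 * the per-cpu statistics exported above can be fetched through the
 * "vfs.cache.nchstats" sysctl.  The first call with a NULL buffer sizes
 * the blob, which holds one struct nchstats per cpu.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        size_t len = 0;

        if (sysctlbyname("vfs.cache.nchstats", NULL, &len, NULL, 0) < 0) {
                perror("sysctlbyname");
                return (1);
        }
        printf("vfs.cache.nchstats: %zu bytes exported\n", len);
        return (0);
}
#endif
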
  265 static struct namecache *cache_zap(struct namecache *ncp, int nonblock);
  266 
  267 /*
  268  * Namespace locking.  The caller must already hold a reference to the
  269  * namecache structure in order to lock/unlock it.  This function prevents
  270  * the namespace from being created or destroyed by accessors other than
  271  * the lock holder.
  272  *
  273  * Note that holding a locked namecache structure prevents other threads
  274  * from making namespace changes (e.g. deleting or creating), prevents
  275  * vnode association state changes by other threads, and prevents the
  276  * namecache entry from being resolved or unresolved by other threads.
  277  *
  278  * An exclusive lock owner has full authority to associate/disassociate
  279  * vnodes and resolve/unresolve the locked ncp.
  280  *
  281  * A shared lock owner only has authority to acquire the underlying vnode,
  282  * if any.
  283  *
  284  * The primary lock field is nc_lockstatus.  nc_locktd is set after the
  285  * fact (when locking) or cleared prior to unlocking.
  286  *
  287  * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
  288  *           or recycled, but it does NOT help you if the vnode had already
  289  *           initiated a recyclement.  If this is important, use cache_get()
  290  *           rather than cache_lock() (and deal with the differences in the
  291  *           way the refs counter is handled).  Or, alternatively, make an
  292  *           unconditional call to cache_validate() or cache_resolve()
  293  *           after cache_lock() returns.
  294  */
  295 static
  296 void
  297 _cache_lock(struct namecache *ncp)
  298 {
  299         thread_t td;
  300         int didwarn;
  301         int begticks;
  302         int error;
  303         u_int count;
  304 
  305         KKASSERT(ncp->nc_refs != 0);
  306         didwarn = 0;
  307         begticks = 0;
  308         td = curthread;
  309 
  310         for (;;) {
  311                 count = ncp->nc_lockstatus;
  312                 cpu_ccfence();
  313 
  314                 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
  315                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  316                                               count, count + 1)) {
  317                                 /*
  318                                  * The vp associated with a locked ncp must
  319                                  * be held to prevent it from being recycled.
  320                                  *
  321                                  * WARNING!  If VRECLAIMED is set the vnode
  322                                  * could already be in the middle of a recycle.
  323                                  * Callers must use cache_vref() or
  324                                  * cache_vget() on the locked ncp to
  325                                  * validate the vp or set the cache entry
  326                                  * to unresolved.
  327                                  *
  328                                  * NOTE! vhold() is allowed if we hold a
  329                                  *       lock on the ncp (which we do).
  330                                  */
  331                                 ncp->nc_locktd = td;
  332                                 if (ncp->nc_vp)
  333                                         vhold(ncp->nc_vp);
  334                                 break;
  335                         }
  336                         /* cmpset failed */
  337                         continue;
  338                 }
  339                 if (ncp->nc_locktd == td) {
  340                         KKASSERT((count & NC_SHLOCK_FLAG) == 0);
  341                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  342                                               count, count + 1)) {
  343                                 break;
  344                         }
  345                         /* cmpset failed */
  346                         continue;
  347                 }
  348                 tsleep_interlock(&ncp->nc_locktd, 0);
  349                 if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
  350                                       count | NC_EXLOCK_REQ) == 0) {
  351                         /* cmpset failed */
  352                         continue;
  353                 }
  354                 if (begticks == 0)
  355                         begticks = ticks;
  356                 error = tsleep(&ncp->nc_locktd, PINTERLOCKED,
  357                                "clock", nclockwarn);
  358                 if (error == EWOULDBLOCK) {
  359                         if (didwarn == 0) {
  360                                 didwarn = ticks;
  361                                 kprintf("[diagnostic] cache_lock: "
  362                                         "blocked on %p %08x",
  363                                         ncp, count);
  364                                 kprintf(" \"%*.*s\"\n",
  365                                         ncp->nc_nlen, ncp->nc_nlen,
  366                                         ncp->nc_name);
  367                         }
  368                 }
  369                 /* loop */
  370         }
  371         if (didwarn) {
  372                 kprintf("[diagnostic] cache_lock: unblocked %*.*s after "
  373                         "%d secs\n",
  374                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
  375                         (int)(ticks + (hz / 2) - begticks) / hz);
  376         }
  377 }
  378 
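/*
 * Illustrative sketch (hypothetical helper, not in the original file) of
 * the WARNING above: an exclusive lock alone does not protect against a
 * recycle that has already been initiated, so the entry is forced back to
 * the unresolved state when the vnode is flagged VRECLAIMED.  This is the
 * same validation _cache_get() performs further below.
 */
#if 0
static void
example_lock_and_validate(struct namecache *ncp)
{
        _cache_hold(ncp);
        _cache_lock(ncp);
        if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
                _cache_setunresolved(ncp);
        /* ... the ncp is now either usable or definitively unresolved ... */
        _cache_unlock(ncp);
        _cache_drop(ncp);
}
#endif
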
  379 /*
  380  * The shared lock works similarly to the exclusive lock except
  381  * nc_locktd is left NULL and we need an interlock (VHOLD) to
  382  * prevent vhold() races, since the moment our cmpset_int succeeds
  383  * another cpu can come in and get its own shared lock.
  384  *
  385  * A critical section is needed to prevent interruption during the
  386  * VHOLD interlock.
  387  */
  388 static
  389 void
  390 _cache_lock_shared(struct namecache *ncp)
  391 {
  392         int didwarn;
  393         int error;
  394         u_int count;
  395         u_int optreq = NC_EXLOCK_REQ;
  396 
  397         KKASSERT(ncp->nc_refs != 0);
  398         didwarn = 0;
  399 
  400         for (;;) {
  401                 count = ncp->nc_lockstatus;
  402                 cpu_ccfence();
  403 
  404                 if ((count & ~NC_SHLOCK_REQ) == 0) {
  405                         crit_enter();
  406                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  407                                       count,
  408                                       (count + 1) | NC_SHLOCK_FLAG |
  409                                                     NC_SHLOCK_VHOLD)) {
  410                                 /*
  411                                  * The vp associated with a locked ncp must
  412                                  * be held to prevent it from being recycled.
  413                                  *
  414                                  * WARNING!  If VRECLAIMED is set the vnode
  415                                  * could already be in the middle of a recycle.
  416                                  * Callers must use cache_vref() or
  417                                  * cache_vget() on the locked ncp to
  418                                  * validate the vp or set the cache entry
  419                                  * to unresolved.
  420                                  *
  421                                  * NOTE! vhold() is allowed if we hold a
  422                                  *       lock on the ncp (which we do).
  423                                  */
  424                                 if (ncp->nc_vp)
  425                                         vhold(ncp->nc_vp);
  426                                 atomic_clear_int(&ncp->nc_lockstatus,
  427                                                  NC_SHLOCK_VHOLD);
  428                                 crit_exit();
  429                                 break;
  430                         }
  431                         /* cmpset failed */
  432                         crit_exit();
  433                         continue;
  434                 }
  435 
  436                 /*
  437                  * If already held shared we can just bump the count, but
  438                  * only allow this if nobody is trying to get the lock
  439                  * exclusively.  If we are blocking too long ignore excl
  440                  * requests (which can race/deadlock us).
  441                  *
  442                  * VHOLD is a bit of a hack.  Even though we successfully
  443                  * added another shared ref, the cpu that got the first
  444                  * shared ref might not yet have held the vnode.
  445                  */
  446                 if ((count & (optreq|NC_SHLOCK_FLAG)) == NC_SHLOCK_FLAG) {
  447                         KKASSERT((count & ~(NC_EXLOCK_REQ |
  448                                             NC_SHLOCK_REQ |
  449                                             NC_SHLOCK_FLAG)) > 0);
  450                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  451                                               count, count + 1)) {
  452                                 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
  453                                         cpu_pause();
  454                                 break;
  455                         }
  456                         continue;
  457                 }
  458                 tsleep_interlock(ncp, 0);
  459                 if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
  460                                       count | NC_SHLOCK_REQ) == 0) {
  461                         /* cmpset failed */
  462                         continue;
  463                 }
  464                 error = tsleep(ncp, PINTERLOCKED, "clocksh", nclockwarn);
  465                 if (error == EWOULDBLOCK) {
  466                         optreq = 0;
  467                         if (didwarn == 0) {
  468                                 didwarn = ticks;
  469                                 kprintf("[diagnostic] cache_lock_shared: "
  470                                         "blocked on %p %08x",
  471                                         ncp, count);
  472                                 kprintf(" \"%*.*s\"\n",
  473                                         ncp->nc_nlen, ncp->nc_nlen,
  474                                         ncp->nc_name);
  475                         }
  476                 }
  477                 /* loop */
  478         }
  479         if (didwarn) {
  480                 kprintf("[diagnostic] cache_lock_shared: "
  481                         "unblocked %*.*s after %d secs\n",
  482                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
  483                         (int)(ticks - didwarn) / hz);
  484         }
  485 }
  486 
  487 /*
  488  * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance,
  489  *       such as the case where one of its children is locked.
  490  */
  491 static
  492 int
  493 _cache_lock_nonblock(struct namecache *ncp)
  494 {
  495         thread_t td;
  496         u_int count;
  497 
  498         td = curthread;
  499 
  500         for (;;) {
  501                 count = ncp->nc_lockstatus;
  502 
  503                 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
  504                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  505                                               count, count + 1)) {
  506                                 /*
  507                                  * The vp associated with a locked ncp must
  508                                  * be held to prevent it from being recycled.
  509                                  *
  510                                  * WARNING!  If VRECLAIMED is set the vnode
  511                                  * could already be in the middle of a recycle.
  512                                  * Callers must use cache_vref() or
  513                                  * cache_vget() on the locked ncp to
  514                                  * validate the vp or set the cache entry
  515                                  * to unresolved.
  516                                  *
  517                                  * NOTE! vhold() is allowed if we hold a
  518                                  *       lock on the ncp (which we do).
  519                                  */
  520                                 ncp->nc_locktd = td;
  521                                 if (ncp->nc_vp)
  522                                         vhold(ncp->nc_vp);
  523                                 break;
  524                         }
  525                         /* cmpset failed */
  526                         continue;
  527                 }
  528                 if (ncp->nc_locktd == td) {
  529                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  530                                               count, count + 1)) {
  531                                 break;
  532                         }
  533                         /* cmpset failed */
  534                         continue;
  535                 }
  536                 return(EWOULDBLOCK);
  537         }
  538         return(0);
  539 }
  540 
  541 /*
  542  * The shared lock works similarly to the exclusive lock except
  543  * nc_locktd is left NULL and we need an interlock (VHOLD) to
  544  * prevent vhold() races, since the moment our cmpset_int succeeds
  545  * another cpu can come in and get its own shared lock.
  546  *
  547  * A critical section is needed to prevent interruption during the
  548  * VHOLD interlock.
  549  */
  550 static
  551 int
  552 _cache_lock_shared_nonblock(struct namecache *ncp)
  553 {
  554         u_int count;
  555 
  556         for (;;) {
  557                 count = ncp->nc_lockstatus;
  558 
  559                 if ((count & ~NC_SHLOCK_REQ) == 0) {
  560                         crit_enter();
  561                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  562                                       count,
  563                                       (count + 1) | NC_SHLOCK_FLAG |
  564                                                     NC_SHLOCK_VHOLD)) {
  565                                 /*
  566                                  * The vp associated with a locked ncp must
  567                                  * be held to prevent it from being recycled.
  568                                  *
  569                                  * WARNING!  If VRECLAIMED is set the vnode
  570                                  * could already be in the middle of a recycle.
  571                                  * Callers must use cache_vref() or
  572                                  * cache_vget() on the locked ncp to
  573                                  * validate the vp or set the cache entry
  574                                  * to unresolved.
  575                                  *
  576                                  * NOTE! vhold() is allowed if we hold a
  577                                  *       lock on the ncp (which we do).
  578                                  */
  579                                 if (ncp->nc_vp)
  580                                         vhold(ncp->nc_vp);
  581                                 atomic_clear_int(&ncp->nc_lockstatus,
  582                                                  NC_SHLOCK_VHOLD);
  583                                 crit_exit();
  584                                 break;
  585                         }
  586                         /* cmpset failed */
  587                         crit_exit();
  588                         continue;
  589                 }
  590 
  591                 /*
  592                  * If already held shared we can just bump the count, but
  593                  * only allow this if nobody is trying to get the lock
  594                  * exclusively.
  595                  *
  596                  * VHOLD is a bit of a hack.  Even though we successfully
  597                  * added another shared ref, the cpu that got the first
  598                  * shared ref might not yet have held the vnode.
  599                  */
  600                 if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) ==
  601                     NC_SHLOCK_FLAG) {
  602                         KKASSERT((count & ~(NC_EXLOCK_REQ |
  603                                             NC_SHLOCK_REQ |
  604                                             NC_SHLOCK_FLAG)) > 0);
  605                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  606                                               count, count + 1)) {
  607                                 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
  608                                         cpu_pause();
  609                                 break;
  610                         }
  611                         continue;
  612                 }
  613                 return(EWOULDBLOCK);
  614         }
  615         return(0);
  616 }
  617 
  618 /*
  619  * Helper function
  620  *
  621  * NOTE: nc_refs can be 0 (degenerate case during _cache_drop).
  622  *
  623  *       nc_locktd must be NULLed out prior to nc_lockstatus getting cleared.
  624  */
  625 static
  626 void
  627 _cache_unlock(struct namecache *ncp)
  628 {
  629         thread_t td __debugvar = curthread;
  630         u_int count;
  631         u_int ncount;
  632         struct vnode *dropvp;
  633 
  634         KKASSERT(ncp->nc_refs >= 0);
  635         KKASSERT((ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) > 0);
  636         KKASSERT((ncp->nc_lockstatus & NC_SHLOCK_FLAG) || ncp->nc_locktd == td);
  637 
  638         count = ncp->nc_lockstatus;
  639         cpu_ccfence();
  640 
  641         /*
  642          * Clear nc_locktd prior to the atomic op (excl lock only)
  643          */
  644         if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1)
  645                 ncp->nc_locktd = NULL;
  646         dropvp = NULL;
  647 
  648         for (;;) {
  649                 if ((count &
  650                      ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ|NC_SHLOCK_FLAG)) == 1) {
  651                         dropvp = ncp->nc_vp;
  652                         if (count & NC_EXLOCK_REQ)
  653                                 ncount = count & NC_SHLOCK_REQ; /* cnt->0 */
  654                         else
  655                                 ncount = 0;
  656 
  657                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  658                                               count, ncount)) {
  659                                 if (count & NC_EXLOCK_REQ)
  660                                         wakeup(&ncp->nc_locktd);
  661                                 else if (count & NC_SHLOCK_REQ)
  662                                         wakeup(ncp);
  663                                 break;
  664                         }
  665                         dropvp = NULL;
  666                 } else {
  667                         KKASSERT((count & NC_SHLOCK_VHOLD) == 0);
  668                         KKASSERT((count & ~(NC_EXLOCK_REQ |
  669                                             NC_SHLOCK_REQ |
  670                                             NC_SHLOCK_FLAG)) > 1);
  671                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  672                                               count, count - 1)) {
  673                                 break;
  674                         }
  675                 }
  676                 count = ncp->nc_lockstatus;
  677                 cpu_ccfence();
  678         }
  679 
  680         /*
  681          * Don't actually drop the vp until we successfully clean out
  682          * the lock, otherwise we may race another shared lock.
  683          */
  684         if (dropvp)
  685                 vdrop(dropvp);
  686 }
  687 
  688 static
  689 int
  690 _cache_lockstatus(struct namecache *ncp)
  691 {
  692         if (ncp->nc_locktd == curthread)
  693                 return(LK_EXCLUSIVE);
  694         if (ncp->nc_lockstatus & NC_SHLOCK_FLAG)
  695                 return(LK_SHARED);
  696         return(-1);
  697 }
  698 
  699 /*
  700  * cache_hold() and cache_drop() prevent the premature deletion of a
  701  * namecache entry but do not prevent operations (such as zapping) on
  702  * that namecache entry.
  703  *
  704  * This routine may only be called from outside this source module if
  705  * nc_refs is already at least 1.
  706  *
  707  * This is a rare case where callers are allowed to hold a spinlock,
  708  * so we can't use one ourselves.
  709  */
  710 static __inline
  711 struct namecache *
  712 _cache_hold(struct namecache *ncp)
  713 {
  714         atomic_add_int(&ncp->nc_refs, 1);
  715         return(ncp);
  716 }
  717 
  718 /*
  719  * Drop a cache entry, taking care to deal with races.
  720  *
  721  * For potential 1->0 transitions we must hold the ncp lock to safely
  722  * test its flags.  An unresolved entry with no children must be zapped
  723  * to avoid leaks.
  724  *
  725  * The call to cache_zap() itself will handle all remaining races and
  726  * will decrement the ncp's refs regardless.  If we are resolved or
  727  * have children nc_refs can safely be dropped to 0 without having to
  728  * zap the entry.
  729  *
  730  * NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion.
  731  *
  732  * NOTE: cache_zap() may return a non-NULL referenced parent which must
  733  *       be dropped in a loop.
  734  */
  735 static __inline
  736 void
  737 _cache_drop(struct namecache *ncp)
  738 {
  739         int refs;
  740 
  741         while (ncp) {
  742                 KKASSERT(ncp->nc_refs > 0);
  743                 refs = ncp->nc_refs;
  744 
  745                 if (refs == 1) {
  746                         if (_cache_lock_nonblock(ncp) == 0) {
  747                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
  748                                 if ((ncp->nc_flag & NCF_UNRESOLVED) &&
  749                                     TAILQ_EMPTY(&ncp->nc_list)) {
  750                                         ncp = cache_zap(ncp, 1);
  751                                         continue;
  752                                 }
  753                                 if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) {
  754                                         _cache_unlock(ncp);
  755                                         break;
  756                                 }
  757                                 _cache_unlock(ncp);
  758                         }
  759                 } else {
  760                         if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1))
  761                                 break;
  762                 }
  763                 cpu_pause();
  764         }
  765 }
  766 
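/*
 * Illustrative sketch (hypothetical, not in the original file): a
 * reference only prevents premature deletion, it does not prevent other
 * operations on the entry, so every _cache_hold() is paired with a
 * _cache_drop().
 */
#if 0
static void
example_hold_then_drop(struct namecache *ncp)
{
        _cache_hold(ncp);
        /* ... inspect ncp; lock it first if it must be modified ... */
        _cache_drop(ncp);
}
#endif
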
  767 /*
  768  * Link a new namecache entry to its parent and to the hash table.  Be
  769  * careful to avoid races if vhold() blocks in the future.
  770  *
  771  * Both ncp and par must be referenced and locked.
  772  *
  773  * NOTE: The hash table spinlock is held during this call, we can't do
  774  *       anything fancy.
  775  */
  776 static void
  777 _cache_link_parent(struct namecache *ncp, struct namecache *par,
  778                    struct nchash_head *nchpp)
  779 {
  780         KKASSERT(ncp->nc_parent == NULL);
  781         ncp->nc_parent = par;
  782         ncp->nc_head = nchpp;
  783 
  784         /*
  785          * Set inheritance flags.  Note that the parent flags may be
  786          * stale due to getattr potentially not having been run yet
  787          * (it gets run during nlookup()'s).
  788          */
  789         ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
  790         if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
  791                 ncp->nc_flag |= NCF_SF_PNOCACHE;
  792         if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
  793                 ncp->nc_flag |= NCF_UF_PCACHE;
  794 
  795         LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
  796 
  797         if (TAILQ_EMPTY(&par->nc_list)) {
  798                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
  799                 /*
  800                  * Any vp associated with an ncp which has children must
  801                  * be held to prevent it from being recycled.
  802                  */
  803                 if (par->nc_vp)
  804                         vhold(par->nc_vp);
  805         } else {
  806                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
  807         }
  808 }
  809 
  810 /*
  811  * Remove the parent and hash associations from a namecache structure.
  812  * If this is the last child of the parent the cache_drop(par) will
  813  * attempt to recursively zap the parent.
  814  *
  815  * ncp must be locked.  This routine will acquire a temporary lock on
  816  * the parent as well as the appropriate hash chain.
  817  */
  818 static void
  819 _cache_unlink_parent(struct namecache *ncp)
  820 {
  821         struct namecache *par;
  822         struct vnode *dropvp;
  823 
  824         if ((par = ncp->nc_parent) != NULL) {
  825                 KKASSERT(ncp->nc_parent == par);
  826                 _cache_hold(par);
  827                 _cache_lock(par);
  828                 spin_lock(&ncp->nc_head->spin);
  829                 LIST_REMOVE(ncp, nc_hash);
  830                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
  831                 dropvp = NULL;
  832                 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
  833                         dropvp = par->nc_vp;
  834                 spin_unlock(&ncp->nc_head->spin);
  835                 ncp->nc_parent = NULL;
  836                 ncp->nc_head = NULL;
  837                 _cache_unlock(par);
  838                 _cache_drop(par);
  839 
  840                 /*
  841                  * We can only safely vdrop with no spinlocks held.
  842                  */
  843                 if (dropvp)
  844                         vdrop(dropvp);
  845         }
  846 }
  847 
  848 /*
  849  * Allocate a new namecache structure.  Most of the code does not require
  850  * zero-termination of the string but it makes vop_compat_ncreate() easier.
  851  */
  852 static struct namecache *
  853 cache_alloc(int nlen)
  854 {
  855         struct namecache *ncp;
  856 
  857         ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
  858         if (nlen)
  859                 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
  860         ncp->nc_nlen = nlen;
  861         ncp->nc_flag = NCF_UNRESOLVED;
  862         ncp->nc_error = ENOTCONN;       /* needs to be resolved */
  863         ncp->nc_refs = 1;
  864 
  865         TAILQ_INIT(&ncp->nc_list);
  866         _cache_lock(ncp);
  867         return(ncp);
  868 }
  869 
  870 /*
  871  * Can only be called for the case where the ncp has never been
  872  * associated with anything (so no spinlocks are needed).
  873  */
  874 static void
  875 _cache_free(struct namecache *ncp)
  876 {
  877         KKASSERT(ncp->nc_refs == 1 && ncp->nc_lockstatus == 1);
  878         if (ncp->nc_name)
  879                 kfree(ncp->nc_name, M_VFSCACHE);
  880         kfree(ncp, M_VFSCACHE);
  881 }
  882 
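/*
 * Illustrative sketch (hypothetical, not in the original file):
 * cache_alloc() returns an entry that is already referenced (nc_refs == 1)
 * and exclusively locked, which is exactly the state _cache_free()
 * asserts, so an entry that was never linked anywhere can be freed
 * immediately.
 */
#if 0
static void
example_alloc_then_free(void)
{
        struct namecache *ncp;

        ncp = cache_alloc(0);   /* referenced and locked on return */
        /* ... decide the entry is not needed after all ... */
        _cache_free(ncp);       /* never linked: refs == 1, lock held */
}
#endif
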
  883 /*
  884  * [re]initialize a nchandle.
  885  */
  886 void
  887 cache_zero(struct nchandle *nch)
  888 {
  889         nch->ncp = NULL;
  890         nch->mount = NULL;
  891 }
  892 
  893 /*
  894  * Ref and deref a namecache structure.
  895  *
  896  * The caller must specify a stable ncp pointer, typically meaning the
  897  * ncp is already referenced but this can also occur indirectly through
  898  * e.g. holding a lock on a direct child.
  899  *
  900  * WARNING: Caller may hold an unrelated read spinlock, which means we can't
  901  *          use read spinlocks here.
  902  *
  903  * MPSAFE if nch is
  904  */
  905 struct nchandle *
  906 cache_hold(struct nchandle *nch)
  907 {
  908         _cache_hold(nch->ncp);
  909         atomic_add_int(&nch->mount->mnt_refs, 1);
  910         return(nch);
  911 }
  912 
  913 /*
  914  * Create a copy of a namecache handle for an already-referenced
  915  * entry.
  916  *
  917  * MPSAFE if nch is
  918  */
  919 void
  920 cache_copy(struct nchandle *nch, struct nchandle *target)
  921 {
  922         *target = *nch;
  923         if (target->ncp)
  924                 _cache_hold(target->ncp);
  925         atomic_add_int(&nch->mount->mnt_refs, 1);
  926 }
  927 
  928 /*
  929  * MPSAFE if nch is
  930  */
  931 void
  932 cache_changemount(struct nchandle *nch, struct mount *mp)
  933 {
  934         atomic_add_int(&nch->mount->mnt_refs, -1);
  935         nch->mount = mp;
  936         atomic_add_int(&nch->mount->mnt_refs, 1);
  937 }
  938 
  939 void
  940 cache_drop(struct nchandle *nch)
  941 {
  942         atomic_add_int(&nch->mount->mnt_refs, -1);
  943         _cache_drop(nch->ncp);
  944         nch->ncp = NULL;
  945         nch->mount = NULL;
  946 }
  947 
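/*
 * Illustrative sketch (hypothetical, not in the original file):
 * duplicating an nchandle for temporary use.  cache_copy() gains its own
 * ncp and mount references, cache_drop() releases them and zeroes the
 * handle.
 */
#if 0
static void
example_copy_then_drop(struct nchandle *nch)
{
        struct nchandle tmp;

        cache_copy(nch, &tmp);
        /* ... use tmp independently of nch ... */
        cache_drop(&tmp);
}
#endif
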
  948 int
  949 cache_lockstatus(struct nchandle *nch)
  950 {
  951         return(_cache_lockstatus(nch->ncp));
  952 }
  953 
  954 void
  955 cache_lock(struct nchandle *nch)
  956 {
  957         _cache_lock(nch->ncp);
  958 }
  959 
  960 void
  961 cache_lock_maybe_shared(struct nchandle *nch, int excl)
  962 {
  963         struct namecache *ncp = nch->ncp;
  964 
  965         if (ncp_shared_lock_disable || excl ||
  966             (ncp->nc_flag & NCF_UNRESOLVED)) {
  967                 _cache_lock(ncp);
  968         } else {
  969                 _cache_lock_shared(ncp);
  970                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
  971                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
  972                                 _cache_unlock(ncp);
  973                                 _cache_lock(ncp);
  974                         }
  975                 } else {
  976                         _cache_unlock(ncp);
  977                         _cache_lock(ncp);
  978                 }
  979         }
  980 }
  981 
  982 /*
  983  * Relock nch1 given an unlocked nch1 and a locked nch2.  The caller
  984  * is responsible for checking both for validity on return as they
  985  * may have become invalid.
  986  *
  987  * We have to deal with potential deadlocks here, just ping pong
  988  * the lock until we get it (we will always block somewhere when
  989  * looping so this is not cpu-intensive).
  990  *
  991  * which = 0    nch1 not locked, nch2 is locked
  992  * which = 1    nch1 is locked, nch2 is not locked
  993  */
  994 void
  995 cache_relock(struct nchandle *nch1, struct ucred *cred1,
  996              struct nchandle *nch2, struct ucred *cred2)
  997 {
  998         int which;
  999 
 1000         which = 0;
 1001 
 1002         for (;;) {
 1003                 if (which == 0) {
 1004                         if (cache_lock_nonblock(nch1) == 0) {
 1005                                 cache_resolve(nch1, cred1);
 1006                                 break;
 1007                         }
 1008                         cache_unlock(nch2);
 1009                         cache_lock(nch1);
 1010                         cache_resolve(nch1, cred1);
 1011                         which = 1;
 1012                 } else {
 1013                         if (cache_lock_nonblock(nch2) == 0) {
 1014                                 cache_resolve(nch2, cred2);
 1015                                 break;
 1016                         }
 1017                         cache_unlock(nch1);
 1018                         cache_lock(nch2);
 1019                         cache_resolve(nch2, cred2);
 1020                         which = 0;
 1021                 }
 1022         }
 1023 }
 1024 
 1025 int
 1026 cache_lock_nonblock(struct nchandle *nch)
 1027 {
 1028         return(_cache_lock_nonblock(nch->ncp));
 1029 }
 1030 
 1031 void
 1032 cache_unlock(struct nchandle *nch)
 1033 {
 1034         _cache_unlock(nch->ncp);
 1035 }
 1036 
 1037 /*
 1038  * ref-and-lock, unlock-and-deref functions.
 1039  *
 1040  * This function is primarily used by nlookup.  Even though cache_lock
 1041  * holds the vnode, it is possible that the vnode may have already
 1042  * initiated a recyclement.
 1043  *
 1044  * We want cache_get() to return a definitively usable vnode or a
 1045  * definitively unresolved ncp.
 1046  */
 1047 static
 1048 struct namecache *
 1049 _cache_get(struct namecache *ncp)
 1050 {
 1051         _cache_hold(ncp);
 1052         _cache_lock(ncp);
 1053         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
 1054                 _cache_setunresolved(ncp);
 1055         return(ncp);
 1056 }
 1057 
 1058 /*
 1059  * Attempt to obtain a shared lock on the ncp.  A shared lock will only
 1060  * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
 1061  * valid.  Otherwise an exclusive lock will be acquired instead.
 1062  */
 1063 static
 1064 struct namecache *
 1065 _cache_get_maybe_shared(struct namecache *ncp, int excl)
 1066 {
 1067         if (ncp_shared_lock_disable || excl ||
 1068             (ncp->nc_flag & NCF_UNRESOLVED)) {
 1069                 return(_cache_get(ncp));
 1070         }
 1071         _cache_hold(ncp);
 1072         _cache_lock_shared(ncp);
 1073         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
 1074                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
 1075                         _cache_unlock(ncp);
 1076                         ncp = _cache_get(ncp);
 1077                         _cache_drop(ncp);
 1078                 }
 1079         } else {
 1080                 _cache_unlock(ncp);
 1081                 ncp = _cache_get(ncp);
 1082                 _cache_drop(ncp);
 1083         }
 1084         return(ncp);
 1085 }
 1086 
 1087 /*
 1088  * This is a special form of _cache_lock() which only succeeds if
 1089  * it can get a pristine, non-recursive lock.  The caller must have
 1090  * already ref'd the ncp.
 1091  *
 1092  * On success the ncp will be locked, on failure it will not.  The
 1093  * ref count does not change either way.
 1094  *
 1095  * We want _cache_lock_special() (on success) to return a definitively
 1096  * usable vnode or a definitively unresolved ncp.
 1097  */
 1098 static int
 1099 _cache_lock_special(struct namecache *ncp)
 1100 {
 1101         if (_cache_lock_nonblock(ncp) == 0) {
 1102                 if ((ncp->nc_lockstatus &
 1103                      ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) {
 1104                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
 1105                                 _cache_setunresolved(ncp);
 1106                         return(0);
 1107                 }
 1108                 _cache_unlock(ncp);
 1109         }
 1110         return(EWOULDBLOCK);
 1111 }
 1112 
 1113 /*
 1114  * This function tries to get a shared lock but will back-off to an exclusive
 1115  * lock if:
 1116  *
 1117  * (1) Some other thread is trying to obtain an exclusive lock
 1118  *     (to prevent the exclusive requester from getting livelocked out
 1119  *     by many shared locks).
 1120  *
 1121  * (2) The current thread already owns an exclusive lock (to avoid
 1122  *     deadlocking).
 1123  *
 1124  * WARNING! On machines with lots of cores we really want to try hard to
 1125  *          get a shared lock or concurrent path lookups can chain-react
 1126  *          into a very high-latency exclusive lock.
 1127  */
 1128 static int
 1129 _cache_lock_shared_special(struct namecache *ncp)
 1130 {
 1131         /*
 1132          * Only honor a successful shared lock (returning 0) if there is
 1133          * no exclusive request pending and the vnode, if present, is not
 1134          * in a reclaimed state.
 1135          */
 1136         if (_cache_lock_shared_nonblock(ncp) == 0) {
 1137                 if ((ncp->nc_lockstatus & NC_EXLOCK_REQ) == 0) {
 1138                         if (ncp->nc_vp == NULL ||
 1139                             (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
 1140                                 return(0);
 1141                         }
 1142                 }
 1143                 _cache_unlock(ncp);
 1144                 return(EWOULDBLOCK);
 1145         }
 1146 
 1147         /*
 1148          * Non-blocking shared lock failed.  If we already own the exclusive
 1149          * lock just acquire another exclusive lock (instead of deadlocking).
 1150          * Otherwise acquire a shared lock.
 1151          */
 1152         if (ncp->nc_locktd == curthread) {
 1153                 _cache_lock(ncp);
 1154                 return(0);
 1155         }
 1156         _cache_lock_shared(ncp);
 1157         return(0);
 1158 }
 1159 
 1160 
 1161 /*
 1162  * NOTE: The same nchandle can be passed for both arguments.
 1163  */
 1164 void
 1165 cache_get(struct nchandle *nch, struct nchandle *target)
 1166 {
 1167         KKASSERT(nch->ncp->nc_refs > 0);
 1168         target->mount = nch->mount;
 1169         target->ncp = _cache_get(nch->ncp);
 1170         atomic_add_int(&target->mount->mnt_refs, 1);
 1171 }
 1172 
 1173 void
 1174 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
 1175 {
 1176         KKASSERT(nch->ncp->nc_refs > 0);
 1177         target->mount = nch->mount;
 1178         target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
 1179         atomic_add_int(&target->mount->mnt_refs, 1);
 1180 }
 1181 
 1182 /*
 1183  *
 1184  */
 1185 static __inline
 1186 void
 1187 _cache_put(struct namecache *ncp)
 1188 {
 1189         _cache_unlock(ncp);
 1190         _cache_drop(ncp);
 1191 }
 1192 
 1193 /*
 1194  *
 1195  */
 1196 void
 1197 cache_put(struct nchandle *nch)
 1198 {
 1199         atomic_add_int(&nch->mount->mnt_refs, -1);
 1200         _cache_put(nch->ncp);
 1201         nch->ncp = NULL;
 1202         nch->mount = NULL;
 1203 }
 1204 
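/*
 * Illustrative sketch (hypothetical, not in the original file) of the
 * ref-and-lock pattern described above: cache_get() yields either a
 * usable vnode or a definitively unresolved entry, which can then be
 * resolved before the handle is released with cache_put().
 */
#if 0
static int
example_get_resolve_put(struct nchandle *nch, struct ucred *cred)
{
        struct nchandle copy;
        int error = 0;

        cache_get(nch, &copy);          /* ref + lock, revalidates the vp */
        if (copy.ncp->nc_flag & NCF_UNRESOLVED)
                error = cache_resolve(&copy, cred);
        /* on success copy.ncp->nc_vp is valid, or NULL for a negative hit */
        cache_put(&copy);               /* unlock + drop */
        return (error);
}
#endif
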
 1205 /*
 1206  * Resolve an unresolved ncp by associating a vnode with it.  If the
 1207  * vnode is NULL, a negative cache entry is created.
 1208  *
 1209  * The ncp should be locked on entry and will remain locked on return.
 1210  */
 1211 static
 1212 void
 1213 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
 1214 {
 1215         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
 1216         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
 1217 
 1218         if (vp != NULL) {
 1219                 /*
 1220                  * Any vp associated with an ncp which has children must
 1221                  * be held.  Any vp associated with a locked ncp must be held.
 1222                  */
 1223                 if (!TAILQ_EMPTY(&ncp->nc_list))
 1224                         vhold(vp);
 1225                 spin_lock(&vp->v_spin);
 1226                 ncp->nc_vp = vp;
 1227                 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
 1228                 spin_unlock(&vp->v_spin);
 1229                 if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
 1230                         vhold(vp);
 1231 
 1232                 /*
 1233                  * Set auxiliary flags
 1234                  */
 1235                 switch(vp->v_type) {
 1236                 case VDIR:
 1237                         ncp->nc_flag |= NCF_ISDIR;
 1238                         break;
 1239                 case VLNK:
 1240                         ncp->nc_flag |= NCF_ISSYMLINK;
 1241                         /* XXX cache the contents of the symlink */
 1242                         break;
 1243                 default:
 1244                         break;
 1245                 }
 1246                 atomic_add_int(&numcache, 1);
 1247                 ncp->nc_error = 0;
 1248                 /* XXX: this is a hack to work around the lack of a real pfs vfs
 1249                  * implementation */
 1250                 if (mp != NULL)
 1251                         if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
 1252                                 vp->v_pfsmp = mp;
 1253         } else {
 1254                 /*
 1255                  * When creating a negative cache hit we set the
 1256                  * namecache_gen.  A later resolve will clean out the
 1257                  * negative cache hit if the mount point's namecache_gen
 1258                  * has changed.  Used by devfs, could also be used by
 1259                  * other remote FSs.
 1260                  */
 1261                 ncp->nc_vp = NULL;
 1262                 spin_lock(&ncspin);
 1263                 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
 1264                 ++numneg;
 1265                 spin_unlock(&ncspin);
 1266                 ncp->nc_error = ENOENT;
 1267                 if (mp)
 1268                         VFS_NCPGEN_SET(mp, ncp);
 1269         }
 1270         ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
 1271 }
 1272 
 1273 /*
 1274  *
 1275  */
 1276 void
 1277 cache_setvp(struct nchandle *nch, struct vnode *vp)
 1278 {
 1279         _cache_setvp(nch->mount, nch->ncp, vp);
 1280 }
 1281 
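/*
 * Illustrative sketch (hypothetical, not in the original file) of how a
 * filesystem resolver might use cache_setvp(): a successful lookup
 * resolves the entry positively, ENOENT creates a negative entry, and any
 * other error leaves the entry unresolved.
 */
#if 0
static void
example_finish_resolve(struct nchandle *nch, struct vnode *vp, int error)
{
        if (error == 0)
                cache_setvp(nch, vp);   /* positive entry, nc_error = 0 */
        else if (error == ENOENT)
                cache_setvp(nch, NULL); /* negative entry, nc_error = ENOENT */
        /* other errors: leave the entry unresolved */
}
#endif
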
 1282 /*
 1283  *
 1284  */
 1285 void
 1286 cache_settimeout(struct nchandle *nch, int nticks)
 1287 {
 1288         struct namecache *ncp = nch->ncp;
 1289 
 1290         if ((ncp->nc_timeout = ticks + nticks) == 0)
 1291                 ncp->nc_timeout = 1;
 1292 }
 1293 
 1294 /*
 1295  * Disassociate the vnode or negative-cache association and mark a
 1296  * namecache entry as unresolved again.  Note that the ncp is still
 1297  * left in the hash table and still linked to its parent.
 1298  *
 1299  * The ncp should be locked and refd on entry and will remain locked and refd
 1300  * on return.
 1301  *
 1302  * This routine is normally never called on a directory containing children.
 1303  * However, NFS often does just that in its rename() code as a cop-out to
 1304  * avoid complex namespace operations.  This disconnects a directory vnode
 1305  * from its namecache and can cause the OLDAPI and NEWAPI to get out of
 1306  * sync.
 1307  *
 1308  */
 1309 static
 1310 void
 1311 _cache_setunresolved(struct namecache *ncp)
 1312 {
 1313         struct vnode *vp;
 1314 
 1315         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
 1316                 ncp->nc_flag |= NCF_UNRESOLVED;
 1317                 ncp->nc_timeout = 0;
 1318                 ncp->nc_error = ENOTCONN;
 1319                 if ((vp = ncp->nc_vp) != NULL) {
 1320                         atomic_add_int(&numcache, -1);
 1321                         spin_lock(&vp->v_spin);
 1322                         ncp->nc_vp = NULL;
 1323                         TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
 1324                         spin_unlock(&vp->v_spin);
 1325 
 1326                         /*
 1327                          * Any vp associated with an ncp with children is
 1328                          * held by that ncp.  Any vp associated with a locked
 1329                          * ncp is held by that ncp.  These conditions must be
 1330                          * undone when the vp is cleared out from the ncp.
 1331                          */
 1332                         if (!TAILQ_EMPTY(&ncp->nc_list))
 1333                                 vdrop(vp);
 1334                         if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
 1335                                 vdrop(vp);
 1336                 } else {
 1337                         spin_lock(&ncspin);
 1338                         TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
 1339                         --numneg;
 1340                         spin_unlock(&ncspin);
 1341                 }
 1342                 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
 1343         }
 1344 }
 1345 
 1346 /*
 1347  * The cache_nresolve() code calls this function to automatically
 1348  * set a resolved cache element to unresolved if it has timed out
 1349  * or if it is a negative cache hit and the mount point namecache_gen
 1350  * has changed.
 1351  */
 1352 static __inline int
 1353 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
 1354 {
 1355         /*
 1356          * Try to zap entries that have timed out.  We have
 1357  * to be careful here because locked leaves may depend
 1358          * on the vnode remaining intact in a parent, so only
 1359          * do this under very specific conditions.
 1360          */
 1361         if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
 1362             TAILQ_EMPTY(&ncp->nc_list)) {
 1363                 return 1;
 1364         }
 1365 
 1366         /*
 1367          * If a resolved negative cache hit is invalid due to
 1368          * the mount's namecache generation being bumped, zap it.
 1369          */
 1370         if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
 1371                 return 1;
 1372         }
 1373 
 1374         /*
 1375          * Otherwise we are good
 1376          */
 1377         return 0;
 1378 }
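
/*
 * Illustrative note (not part of the original file): the signed
 * subtraction in the timeout test above is the conventional wrap-tolerant
 * way to compare against the kernel tick counter.  A minimal sketch of the
 * same idiom, using a hypothetical helper name:
 */
#if 0
static int
example_ticks_expired(int deadline, int now)
{
        /*
         * Non-zero once 'now' has passed 'deadline', even if the counter
         * wrapped, provided the two are less than INT_MAX ticks apart.
         */
        return ((int)(deadline - now) < 0);
}
#endif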
 1379 
 1380 static __inline void
 1381 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
 1382 {
 1383         /*
 1384          * Already in an unresolved state, nothing to do.
 1385          */
 1386         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
 1387                 if (_cache_auto_unresolve_test(mp, ncp))
 1388                         _cache_setunresolved(ncp);
 1389         }
 1390 }
 1391 
 1392 /*
 1393  * Mark the nchandle's namecache entry as unresolved.
 1394  */
 1395 void
 1396 cache_setunresolved(struct nchandle *nch)
 1397 {
 1398         _cache_setunresolved(nch->ncp);
 1399 }
 1400 
 1401 /*
 1402  * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
 1403  * looking for matches.  This flag tells the lookup code when it must
 1404  * check for a mount linkage and also prevents the directories in question
 1405  * from being deleted or renamed.
 1406  */
 1407 static
 1408 int
 1409 cache_clrmountpt_callback(struct mount *mp, void *data)
 1410 {
 1411         struct nchandle *nch = data;
 1412 
 1413         if (mp->mnt_ncmounton.ncp == nch->ncp)
 1414                 return(1);
 1415         if (mp->mnt_ncmountpt.ncp == nch->ncp)
 1416                 return(1);
 1417         return(0);
 1418 }
 1419 
 1420 /*
 1421  * Clear NCF_ISMOUNTPT on the nchandle if no mounts still reference it.
 1422  */
 1423 void
 1424 cache_clrmountpt(struct nchandle *nch)
 1425 {
 1426         int count;
 1427 
 1428         count = mountlist_scan(cache_clrmountpt_callback, nch,
 1429                                MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
 1430         if (count == 0)
 1431                 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
 1432 }
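
/*
 * Illustrative sketch (not part of the original file): teardown code might
 * call cache_clrmountpt() on the directory that used to be covered by a
 * mount; the flag is only cleared if the mountlist scan finds no remaining
 * references, allowing the directory to be renamed or deleted again.  The
 * example_* helper is hypothetical.
 */
#if 0
static void
example_clear_mountpoint(struct nchandle *nch)
{
        cache_clrmountpt(nch);          /* nch: formerly covered directory */
}
#endif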
 1433 
 1434 /*
 1435  * Invalidate portions of the namecache topology given a starting entry.
 1436  * The passed ncp is set to an unresolved state (see the CINV_* flags below).
 1437  *
 1438  * The passed ncp must be referenced and locked.  The routine may unlock
 1439  * and relock ncp several times, and will recheck the children and loop
 1440  * to catch races.  When done the passed ncp will be returned with the
 1441  * reference and lock intact.
 1442  *
 1443  * CINV_DESTROY         - Set a flag in the passed ncp entry indicating
 1444  *                        that the physical underlying nodes have been 
 1445  *                        destroyed... as in deleted.  For example, when
 1446  *                        a directory is removed.  This will cause record
 1447  *                        lookups on the name to no longer be able to find
 1448  *                        the record and tells the resolver to return failure
 1449  *                        rather than trying to resolve through the parent.
 1450  *
 1451  *                        The topology itself, including ncp->nc_name,
 1452  *                        remains intact.
 1453  *
 1454  *                        This only applies to the passed ncp; even if
 1455  *                        CINV_CHILDREN is specified the children are not flagged.
 1456  *
 1457  * CINV_CHILDREN        - Set all children (recursively) to an unresolved
 1458  *                        state as well.
 1459  *
 1460  *                        Note that this will also have the side effect of
 1461  *                        cleaning out any unreferenced nodes in the topology
 1462  *                        from the leaves up as the recursion backs out.
 1463  *
 1464  * Note that the topology for any referenced nodes remains intact, but
 1465  * the nodes will be marked as having been destroyed and will be set
 1466  * to an unresolved state.
 1467  *
 1468  * It is possible for cache_inval() to race a cache_resolve(), meaning that
 1469  * the namecache entry may not actually be invalidated on return if it was
 1470  * revalidated while recursing down into its children.  This code guarantees
 1471  * that the node(s) will go through an invalidation cycle, but does not
 1472  * guarantee that they will remain in an invalidated state.
 1473  *
 1474  * Returns non-zero if a revalidation was detected during the invalidation
 1475  * recursion, zero otherwise.  Note that since only the original ncp is
 1476  * locked the revalidation ultimately can only indicate that the original ncp
 1477  * *MIGHT* have been re-resolved.
 1478  *
 1479  * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
 1480  * have to avoid blowing out the kernel stack.  We do this by saving the
 1481  * deep namecache node and aborting the recursion, then re-recursing at that
 1482  * node using a depth-first algorithm in order to allow multiple deep
 1483  * recursions to chain through each other, then we restart the invalidation
 1484  * from scratch.
 1485  */
 1486 
 1487 struct cinvtrack {
 1488         struct namecache *resume_ncp;
 1489         int depth;
 1490 };
 1491 
 1492 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);
 1493 
 1494 static
 1495 int
 1496 _cache_inval(struct namecache *ncp, int flags)
 1497 {
 1498         struct cinvtrack track;
 1499         struct namecache *ncp2;
 1500         int r;
 1501 
 1502         track.depth = 0;
 1503         track.resume_ncp = NULL;
 1504 
 1505         for (;;) {
 1506                 r = _cache_inval_internal(ncp, flags, &track);
 1507                 if (track.resume_ncp == NULL)
 1508                         break;
 1509                 kprintf("Warning: deep namecache recursion at %s\n",
 1510                         ncp->nc_name);
 1511                 _cache_unlock(ncp);
 1512                 while ((ncp2 = track.resume_ncp) != NULL) {
 1513                         track.resume_ncp = NULL;
 1514                         _cache_lock(ncp2);
 1515                         _cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
 1516                                              &track);
 1517                         _cache_put(ncp2);
 1518                 }
 1519                 _cache_lock(ncp);
 1520         }
 1521         return(r);
 1522 }
 1523 
 1524 int
 1525 cache_inval(struct nchandle *nch, int flags)
 1526 {
 1527         return(_cache_inval(nch->ncp, flags));
 1528 }
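
/*
 * Illustrative sketch (not part of the original file): an rmdir-style path
 * might invalidate the whole cached subtree under the victim directory
 * once the physical removal has succeeded.  The example_* helper is
 * hypothetical; the nchandle is assumed to be referenced and locked as
 * required above.
 */
#if 0
static void
example_invalidate_removed_dir(struct nchandle *nch)
{
        /*
         * CINV_DESTROY marks the passed entry as physically destroyed,
         * CINV_CHILDREN recursively unresolves anything cached below it.
         */
        cache_inval(nch, CINV_DESTROY | CINV_CHILDREN);
}
#endif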
 1529 
 1530 /*
 1531  * Helper for _cache_inval().  The passed ncp is refd and locked and
 1532  * remains that way on return, but may be unlocked/relocked multiple
 1533  * times by the routine.
 1534  */
 1535 static int
 1536 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
 1537 {
 1538         struct namecache *kid;
 1539         struct namecache *nextkid;
 1540         int rcnt = 0;
 1541 
 1542         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
 1543 
 1544         _cache_setunresolved(ncp);
 1545         if (flags & CINV_DESTROY)
 1546                 ncp->nc_flag |= NCF_DESTROYED;
 1547         if ((flags & CINV_CHILDREN) && 
 1548             (kid = TAILQ_FIRST(&ncp->nc_list)) != NULL
 1549         ) {
 1550                 _cache_hold(kid);
 1551                 if (++track->depth > MAX_RECURSION_DEPTH) {
 1552                         track->resume_ncp = ncp;
 1553                         _cache_hold(ncp);
 1554                         ++rcnt;
 1555                 }
 1556                 _cache_unlock(ncp);
 1557                 while (kid) {
 1558                         if (track->resume_ncp) {
 1559                                 _cache_drop(kid);
 1560                                 break;
 1561                         }
 1562                         if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
 1563                                 _cache_hold(nextkid);
 1564                         if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
 1565                             TAILQ_FIRST(&kid->nc_list)
 1566                         ) {
 1567                                 _cache_lock(kid);
 1568                                 rcnt += _cache_inval_internal(kid, flags & ~CINV_DESTROY, track);
 1569                                 _cache_unlock(kid);
 1570                         }
 1571                         _cache_drop(kid);
 1572                         kid = nextkid;
 1573                 }
 1574                 --track->depth;
 1575                 _cache_lock(ncp);
 1576         }
 1577 
 1578         /*
 1579          * Someone could have gotten in there while ncp was unlocked,
 1580          * retry if so.
 1581          */
 1582         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
 1583                 ++rcnt;
 1584         return (rcnt);
 1585 }
 1586 
 1587 /*
 1588  * Invalidate a vnode's namecache associations.  To avoid races against
 1589  * the resolver we do not invalidate a node which we previously invalidated
 1590  * but which was then re-resolved while we were in the invalidation loop.
 1591  *
 1592  * Returns non-zero if any namecache entries remain after the invalidation
 1593  * loop completed.
 1594  *
 1595  * NOTE: Unlike the namecache topology which guarantees that ncp's will not
 1596  *       be ripped out of the topology while held, the vnode's v_namecache
 1597  *       list has no such restriction.  NCP's can be ripped out of the list
 1598  *       at virtually any time if not locked, even if held.
 1599  *
 1600  *       In addition, the v_namecache list itself must be locked via
 1601  *       the vnode's spinlock.
 1602  */
 1603 int
 1604 cache_inval_vp(struct vnode *vp, int flags)
 1605 {
 1606         struct namecache *ncp;
 1607         struct namecache *next;
 1608 
 1609 restart:
 1610         spin_lock(&vp->v_spin);
 1611         ncp = TAILQ_FIRST(&vp->v_namecache);
 1612         if (ncp)
 1613                 _cache_hold(ncp);
 1614         while (ncp) {
 1615                 /* loop entered with ncp held and vp spin-locked */
 1616                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
 1617                         _cache_hold(next);
 1618                 spin_unlock(&vp->v_spin);
 1619                 _cache_lock(ncp);
 1620                 if (ncp->nc_vp != vp) {
 1621                         kprintf("Warning: cache_inval_vp: race-A detected on "
 1622                                 "%s\n", ncp->nc_name);
 1623                         _cache_put(ncp);
 1624                         if (next)
 1625                                 _cache_drop(next);
 1626                         goto restart;
 1627                 }
 1628                 _cache_inval(ncp, flags);
 1629                 _cache_put(ncp);                /* also releases reference */
 1630                 ncp = next;
 1631                 spin_lock(&vp->v_spin);
 1632                 if (ncp && ncp->nc_vp != vp) {
 1633                         spin_unlock(&vp->v_spin);
 1634                         kprintf("Warning: cache_inval_vp: race-B detected on "
 1635                                 "%s\n", ncp->nc_name);
 1636                         _cache_drop(ncp);
 1637                         goto restart;
 1638                 }
 1639         }
 1640         spin_unlock(&vp->v_spin);
 1641         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
 1642 }
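
/*
 * Illustrative sketch (not part of the original file): reclaim-style code
 * might use cache_inval_vp() to strip all name associations from a vnode
 * and check whether any survived.  The flag choice is only an example and
 * the example_* helper is hypothetical.
 */
#if 0
static int
example_strip_names(struct vnode *vp)
{
        /* non-zero return: some namecache entries could not be removed */
        return (cache_inval_vp(vp, CINV_DESTROY));
}
#endif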
 1643 
 1644 /*
 1645  * This routine is used instead of the normal cache_inval_vp() when we
 1646  * are trying to recycle otherwise good vnodes.
 1647  *
 1648  * Return 0 on success, non-zero if not all namecache records could be
 1649  * disassociated from the vnode (for various reasons).
 1650  */
 1651 int
 1652 cache_inval_vp_nonblock(struct vnode *vp)
 1653 {
 1654         struct namecache *ncp;
 1655         struct namecache *next;
 1656 
 1657         spin_lock(&vp->v_spin);
 1658         ncp = TAILQ_FIRST(&vp->v_namecache);
 1659         if (ncp)
 1660                 _cache_hold(ncp);
 1661         while (ncp) {
 1662                 /* loop entered with ncp held */
 1663                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
 1664                         _cache_hold(next);
 1665                 spin_unlock(&vp->v_spin);
 1666                 if (_cache_lock_nonblock(ncp)) {
 1667                         _cache_drop(ncp);
 1668                         if (next)
 1669                                 _cache_drop(next);
 1670                         goto done;
 1671                 }
 1672                 if (ncp->nc_vp != vp) {
 1673                         kprintf("Warning: cache_inval_vp_nonblock: race-A detected on "
 1674                                 "%s\n", ncp->nc_name);
 1675                         _cache_put(ncp);
 1676                         if (next)
 1677                                 _cache_drop(next);
 1678                         goto done;
 1679                 }
 1680                 _cache_inval(ncp, 0);
 1681                 _cache_put(ncp);                /* also releases reference */
 1682                 ncp = next;
 1683                 spin_lock(&vp->v_spin);
 1684                 if (ncp && ncp->nc_vp != vp) {
 1685                         spin_unlock(&vp->v_spin);
 1686                         kprintf("Warning: cache_inval_vp_nonblock: race-B detected on "
 1687                                 "%s\n", ncp->nc_name);
 1688                         _cache_drop(ncp);
 1689                         goto done;
 1690                 }
 1691         }
 1692         spin_unlock(&vp->v_spin);
 1693 done:
 1694         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
 1695 }
 1696 
 1697 /*
 1698  * The source ncp has been renamed to the target ncp.  Both fncp and tncp
 1699  * must be locked.  The target ncp is destroyed (as a normal rename-over
 1700  * would destroy the target file or directory).
 1701  *
 1702  * Because there may be references to the source ncp we cannot copy its
 1703  * contents to the target.  Instead the source ncp is relinked as the target
 1704  * and the target ncp is removed from the namecache topology.
 1705  */
 1706 void
 1707 cache_rename(struct nchandle *fnch, struct nchandle *tnch)
 1708 {
 1709         struct namecache *fncp = fnch->ncp;
 1710         struct namecache *tncp = tnch->ncp;
 1711         struct namecache *tncp_par;
 1712         struct nchash_head *nchpp;
 1713         u_int32_t hash;
 1714         char *oname;
 1715         char *nname;
 1716 
 1717         if (tncp->nc_nlen) {
 1718                 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
 1719                 bcopy(tncp->nc_name, nname, tncp->nc_nlen);
 1720                 nname[tncp->nc_nlen] = 0;
 1721         } else {
 1722                 nname = NULL;
 1723         }
 1724 
 1725         /*
 1726          * Rename fncp (unlink)
 1727          */
 1728         _cache_unlink_parent(fncp);
 1729         oname = fncp->nc_name;
 1730         fncp->nc_name = nname;
 1731         fncp->nc_nlen = tncp->nc_nlen;
 1732         if (oname)
 1733                 kfree(oname, M_VFSCACHE);
 1734 
 1735         tncp_par = tncp->nc_parent;
 1736         _cache_hold(tncp_par);
 1737         _cache_lock(tncp_par);
 1738 
 1739         /*
 1740          * Rename fncp (relink)
 1741          */
 1742         hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
 1743         hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
 1744         nchpp = NCHHASH(hash);
 1745 
 1746         spin_lock(&nchpp->spin);
 1747         _cache_link_parent(fncp, tncp_par, nchpp);
 1748         spin_unlock(&nchpp->spin);
 1749 
 1750         _cache_put(tncp_par);
 1751 
 1752         /*
 1753          * Get rid of the overwritten tncp (unlink)
 1754          */
 1755         _cache_unlink(tncp);
 1756 }
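
/*
 * Illustrative sketch (not part of the original file): once a rename has
 * succeeded on-disk, namespace code would relink the namecache with
 * cache_rename().  Both nchandles are assumed to be locked; the overwritten
 * target entry is unlinked from the topology by the call itself.  The
 * example_* helper is hypothetical.
 */
#if 0
static void
example_finish_rename(struct nchandle *fnch, struct nchandle *tnch)
{
        cache_rename(fnch, tnch);       /* fncp takes over tncp's name/parent */
}
#endif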
 1757 
 1758 /*
 1759  * Perform actions consistent with unlinking a file.  The passed-in ncp
 1760  * must be locked.
 1761  *
 1762  * The ncp is marked DESTROYED so it no longer shows up in searches,
 1763  * and will be physically deleted when the vnode goes away.
 1764  *
 1765  * If the related vnode has no refs then we cycle it through vget()/vput()
 1766  * to (possibly if we don't have a ref race) trigger a deactivation,
 1767  * allowing the VFS to trivially detect and recycle the deleted vnode
 1768  * via VOP_INACTIVE().
 1769  *
 1770  * NOTE: _cache_rename() will automatically call _cache_unlink() on the
 1771  *       target ncp.
 1772  */
 1773 void
 1774 cache_unlink(struct nchandle *nch)
 1775 {
 1776         _cache_unlink(nch->ncp);
 1777 }
 1778 
 1779 static void
 1780 _cache_unlink(struct namecache *ncp)
 1781 {
 1782         struct vnode *vp;
 1783 
 1784         /*
 1785          * Causes lookups to fail and allows another ncp with the same
 1786          * name to be created under ncp->nc_parent.
 1787          */
 1788         ncp->nc_flag |= NCF_DESTROYED;
 1789 
 1790         /*
 1791          * Attempt to trigger a deactivation.  Set VAUX_FINALIZE to
 1792          * force action on the 1->0 transition.
 1793          */
 1794         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
 1795             (vp = ncp->nc_vp) != NULL) {
 1796                 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
 1797                 if (VREFCNT(vp) <= 0) {
 1798                         if (vget(vp, LK_SHARED) == 0)
 1799                                 vput(vp);
 1800                 }
 1801         }
 1802 }
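
/*
 * Illustrative sketch (not part of the original file): a remove path might
 * mark the entry destroyed once the file has been unlinked on-disk, so that
 * later lookups miss it and the vnode can be deactivated promptly.  The
 * example_* helper is hypothetical.
 */
#if 0
static void
example_finish_remove(struct nchandle *nch)
{
        /*
         * nch is assumed locked and referenced by the caller, which keeps
         * ownership and releases it through its normal cleanup path.
         */
        cache_unlink(nch);
}
#endif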
 1803 
 1804 /*
 1805  * vget the vnode associated with the namecache entry.  Resolve the namecache
 1806  * entry if necessary.  The passed ncp must be referenced and locked.  If
 1807  * the ncp is resolved it might be locked shared.
 1808  *
 1809  * lk_type may be LK_SHARED, LK_EXCLUSIVE.  A ref'd, possibly locked
 1810  * (depending on the passed lk_type) will be returned in *vpp with an error
 1811  * of 0, or NULL will be returned in *vpp with a non-0 error code.  The
 1812  * most typical error is ENOENT, meaning that the ncp represents a negative
 1813  * cache hit and there is no vnode to retrieve, but other errors can occur
 1814  * too.
 1815  *
 1816  * The vget() can race a reclaim.  If this occurs we re-resolve the
 1817  * namecache entry.
 1818  *
 1819  * There are numerous places in the kernel where vget() is called on a
 1820  * vnode while one or more of its namecache entries is locked.  Releasing
 1821  * a vnode never deadlocks against locked namecache entries (the vnode
 1822  * will not get recycled while referenced ncp's exist).  This means we
 1823  * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
 1824  * lock when acquiring the vp lock or we might cause a deadlock.
 1825  *
 1826  * NOTE: The passed-in ncp must be locked exclusively if it is initially
 1827  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
 1828  *       relocked exclusively before being re-resolved.
 1829  */
 1830 int
 1831 cache_vget(struct nchandle *nch, struct ucred *cred,
 1832            int lk_type, struct vnode **vpp)
 1833 {
 1834         struct namecache *ncp;
 1835         struct vnode *vp;
 1836         int error;
 1837 
 1838         ncp = nch->ncp;
 1839 again:
 1840         vp = NULL;
 1841         if (ncp->nc_flag & NCF_UNRESOLVED)
 1842                 error = cache_resolve(nch, cred);
 1843         else
 1844                 error = 0;
 1845 
 1846         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
 1847                 error = vget(vp, lk_type);
 1848                 if (error) {
 1849                         /*
 1850                          * VRECLAIM race
 1851                          *
 1852                          * The ncp may have been locked shared, we must relock
 1853                          * it exclusively before we can set it to unresolved.
 1854                          */
 1855                         if (error == ENOENT) {
 1856                                 kprintf("Warning: vnode reclaim race detected "
 1857                                         "in cache_vget on %p (%s)\n",
 1858                                         vp, ncp->nc_name);
 1859                                 _cache_unlock(ncp);
 1860                                 _cache_lock(ncp);
 1861                                 _cache_setunresolved(ncp);
 1862                                 goto again;
 1863                         }
 1864 
 1865                         /*
 1866                          * Not a reclaim race, some other error.
 1867                          */
 1868                         KKASSERT(ncp->nc_vp == vp);
 1869                         vp = NULL;
 1870                 } else {
 1871                         KKASSERT(ncp->nc_vp == vp);
 1872                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
 1873                 }
 1874         }
 1875         if (error == 0 && vp == NULL)
 1876                 error = ENOENT;
 1877         *vpp = vp;
 1878         return(error);
 1879 }
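
/*
 * Illustrative sketch (not part of the original file): the typical pattern
 * for going from a parent nchandle and a path component to a usable vnode,
 * combining cache_nlookup() (documented further down in this file) with
 * cache_vget().  cache_vref() below is the ref-only variant.  LK_SHARED is
 * just an example lock type; the shared/exclusive locking subtleties for
 * unresolved entries are glossed over.  The example_* helper is
 * hypothetical.
 */
#if 0
static int
example_lookup_component(struct nchandle *par_nch, struct nlcomponent *nlc,
                         struct ucred *cred)
{
        struct nchandle nch;
        struct vnode *vp;
        int error;

        nch = cache_nlookup(par_nch, nlc);      /* returned locked + referenced */
        error = cache_vget(&nch, cred, LK_SHARED, &vp);
        if (error == 0) {
                /* ... use the locked, referenced vnode ... */
                vput(vp);                       /* drops lock and ref on vp */
        }
        cache_put(&nch);                        /* drops lock and ref on nch */
        return (error);
}
#endif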
 1880 
 1881 /*
 1882  * Similar to cache_vget() but only acquires a ref on the vnode.
 1883  *
 1884  * NOTE: The passed-in ncp must be locked exclusively if it is initially
 1885  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
 1886  *       relocked exclusively before being re-resolved.
 1887  */
 1888 int
 1889 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
 1890 {
 1891         struct namecache *ncp;
 1892         struct vnode *vp;
 1893         int error;
 1894 
 1895         ncp = nch->ncp;
 1896 again:
 1897         vp = NULL;
 1898         if (ncp->nc_flag & NCF_UNRESOLVED)
 1899                 error = cache_resolve(nch, cred);
 1900         else
 1901                 error = 0;
 1902 
 1903         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
 1904                 error = vget(vp, LK_SHARED);
 1905                 if (error) {
 1906                         /*
 1907                          * VRECLAIM race
 1908                          */
 1909                         if (error == ENOENT) {
 1910                                 kprintf("Warning: vnode reclaim race detected "
 1911                                         "in cache_vref on %p (%s)\n",
 1912                                         vp, ncp->nc_name);
 1913                                 _cache_unlock(ncp);
 1914                                 _cache_lock(ncp);
 1915                                 _cache_setunresolved(ncp);
 1916                                 goto again;
 1917                         }
 1918 
 1919                         /*
 1920                          * Not a reclaim race, some other error.
 1921                          */
 1922                         KKASSERT(ncp->nc_vp == vp);
 1923                         vp = NULL;
 1924                 } else {
 1925                         KKASSERT(ncp->nc_vp == vp);
 1926                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
 1927                         /* caller does not want a lock */
 1928                         vn_unlock(vp);
 1929                 }
 1930         }
 1931         if (error == 0 && vp == NULL)
 1932                 error = ENOENT;
 1933         *vpp = vp;
 1934         return(error);
 1935 }
 1936 
 1937 /*
 1938  * Return a referenced vnode representing the parent directory of
 1939  * ncp.
 1940  *
 1941  * Because the caller has locked the ncp it should not be possible for
 1942  * the parent ncp to go away.  However, the parent can unresolve its
 1943  * dvp at any time so we must be able to acquire a lock on the parent
 1944  * to safely access nc_vp.
 1945  *
 1946  * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
 1947  * so use vhold()/vdrop() while holding the lock to prevent dvp from
 1948  * getting destroyed.
 1949  *
 1950  * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
 1951  *       lock on the ncp in question.
 1952  */
 1953 static struct vnode *
 1954 cache_dvpref(struct namecache *ncp)
 1955 {
 1956         struct namecache *par;
 1957         struct vnode *dvp;
 1958 
 1959         dvp = NULL;
 1960         if ((par = ncp->nc_parent) != NULL) {
 1961                 _cache_hold(par);
 1962                 _cache_lock(par);
 1963                 if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
 1964                         if ((dvp = par->nc_vp) != NULL)
 1965                                 vhold(dvp);
 1966                 }
 1967                 _cache_unlock(par);
 1968                 if (dvp) {
 1969                         if (vget(dvp, LK_SHARED) == 0) {
 1970                                 vn_unlock(dvp);
 1971                                 vdrop(dvp);
 1972                                 /* return refd, unlocked dvp */
 1973                         } else {
 1974                                 vdrop(dvp);
 1975                                 dvp = NULL;
 1976                         }
 1977                 }
 1978                 _cache_drop(par);
 1979         }
 1980         return(dvp);
 1981 }
 1982 
 1983 /*
 1984  * Convert a directory vnode to a namecache record without any other 
 1985  * knowledge of the topology.  This ONLY works with directory vnodes and
 1986  * is ONLY used by the NFS server.  dvp must be refd but unlocked, and the
 1987  * returned ncp (if not NULL) will be held and unlocked.
 1988  *
 1989  * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
 1990  * If 'makeit' is 1 we attempt to track-down and create the namecache topology
 1991  * for dvp.  This will fail only if the directory has been deleted out from
 1992  * under the caller.  
 1993  *
 1994  * Callers must always check for a NULL return no matter the value of 'makeit'.
 1995  *
 1996  * To avoid underflowing the kernel stack each recursive call increments
 1997  * the makeit variable.
 1998  */
 1999 
 2000 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
 2001                                   struct vnode *dvp, char *fakename);
 2002 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 
 2003                                   struct vnode **saved_dvp);
 2004 
 2005 int
 2006 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
 2007               struct nchandle *nch)
 2008 {
 2009         struct vnode *saved_dvp;
 2010         struct vnode *pvp;
 2011         char *fakename;
 2012         int error;
 2013 
 2014         nch->ncp = NULL;
 2015         nch->mount = dvp->v_mount;
 2016         saved_dvp = NULL;
 2017         fakename = NULL;
 2018 
 2019         /*
 2020          * Handle the makeit == 0 degenerate case
 2021          */
 2022         if (makeit == 0) {
 2023                 spin_lock_shared(&dvp->v_spin);
 2024                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
 2025                 if (nch->ncp)
 2026                         cache_hold(nch);
 2027                 spin_unlock_shared(&dvp->v_spin);
 2028         }
 2029 
 2030         /*
 2031          * Loop until resolution, inside code will break out on error.
 2032          */
 2033         while (makeit) {
 2034                 /*
 2035                  * Break out if we successfully acquire a working ncp.
 2036                  */
 2037                 spin_lock_shared(&dvp->v_spin);
 2038                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
 2039                 if (nch->ncp) {
 2040                         cache_hold(nch);
 2041                         spin_unlock_shared(&dvp->v_spin);
 2042                         break;
 2043                 }
 2044                 spin_unlock_shared(&dvp->v_spin);
 2045 
 2046                 /*
 2047                  * If dvp is the root of its filesystem it should already
 2048                  * have a namecache pointer associated with it as a side 
 2049                  * effect of the mount, but it may have been disassociated.
 2050                  */
 2051                 if (dvp->v_flag & VROOT) {
 2052                         nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
 2053                         error = cache_resolve_mp(nch->mount);
 2054                         _cache_put(nch->ncp);
 2055                         if (ncvp_debug) {
 2056                                 kprintf("cache_fromdvp: resolve root of mount %p error %d", 
 2057                                         dvp->v_mount, error);
 2058                         }
 2059                         if (error) {
 2060                                 if (ncvp_debug)
 2061                                         kprintf(" failed\n");
 2062                                 nch->ncp = NULL;
 2063                                 break;
 2064                         }
 2065                         if (ncvp_debug)
 2066                                 kprintf(" succeeded\n");
 2067                         continue;
 2068                 }
 2069 
 2070                 /*
 2071                  * If we are recursed too deeply resort to an O(n^2)
 2072                  * algorithm to resolve the namecache topology.  The
 2073                  * resolved pvp is left referenced in saved_dvp to
 2074                  * prevent the tree from being destroyed while we loop.
 2075                  */
 2076                 if (makeit > 20) {
 2077                         error = cache_fromdvp_try(dvp, cred, &saved_dvp);
 2078                         if (error) {
 2079                                 kprintf("lookupdotdot(longpath) failed %d "
 2080                                        "dvp %p\n", error, dvp);
 2081                                 nch->ncp = NULL;
 2082                                 break;
 2083                         }
 2084                         continue;
 2085                 }
 2086 
 2087                 /*
 2088                  * Get the parent directory and resolve its ncp.
 2089                  */
 2090                 if (fakename) {
 2091                         kfree(fakename, M_TEMP);
 2092                         fakename = NULL;
 2093                 }
 2094                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
 2095                                           &fakename);
 2096                 if (error) {
 2097                         kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
 2098                         break;
 2099                 }
 2100                 vn_unlock(pvp);
 2101 
 2102                 /*
 2103                  * Reuse makeit as a recursion depth counter.  On success
 2104                  * nch will be fully referenced.
 2105                  */
 2106                 cache_fromdvp(pvp, cred, makeit + 1, nch);
 2107                 vrele(pvp);
 2108                 if (nch->ncp == NULL)
 2109                         break;
 2110 
 2111                 /*
 2112                  * Do an inefficient scan of pvp (embodied by ncp) to look
 2113                  * for dvp.  This will create a namecache record for dvp on
 2114                  * success.  We loop up to recheck on success.
 2115                  *
 2116                  * ncp and dvp are both held but not locked.
 2117                  */
 2118                 error = cache_inefficient_scan(nch, cred, dvp, fakename);
 2119                 if (error) {
 2120                         kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
 2121                                 pvp, nch->ncp->nc_name, dvp);
 2122                         cache_drop(nch);
 2123                         /* nch was NULLed out, reload mount */
 2124                         nch->mount = dvp->v_mount;
 2125                         break;
 2126                 }
 2127                 if (ncvp_debug) {
 2128                         kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
 2129                                 pvp, nch->ncp->nc_name);
 2130                 }
 2131                 cache_drop(nch);
 2132                 /* nch was NULLed out, reload mount */
 2133                 nch->mount = dvp->v_mount;
 2134         }
 2135 
 2136         /*
 2137          * If nch->ncp is non-NULL it will have been held already.
 2138          */
 2139         if (fakename)
 2140                 kfree(fakename, M_TEMP);
 2141         if (saved_dvp)
 2142                 vrele(saved_dvp);
 2143         if (nch->ncp)
 2144                 return (0);
 2145         return (EINVAL);
 2146 }
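
/*
 * Illustrative sketch (not part of the original file): an NFS-server-style
 * caller holding only a directory vnode (e.g. derived from a file handle)
 * could rebuild a namecache handle for it as shown.  dvp must be referenced
 * but unlocked; a makeit of 1 asks the routine to reconstruct any missing
 * topology; the returned handle is held but not locked.  The example_*
 * helper is hypothetical.
 */
#if 0
static int
example_handle_to_nch(struct vnode *dvp, struct ucred *cred,
                      struct nchandle *nch)
{
        int error;

        error = cache_fromdvp(dvp, cred, 1, nch);
        if (error == 0) {
                /* ... use nch ... */
                cache_drop(nch);
        }
        return (error);
}
#endif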
 2147 
 2148 /*
 2149  * Go up the chain of parent directories until we find something
 2150  * we can resolve into the namecache.  This is very inefficient.
 2151  */
 2152 static
 2153 int
 2154 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
 2155                   struct vnode **saved_dvp)
 2156 {
 2157         struct nchandle nch;
 2158         struct vnode *pvp;
 2159         int error;
 2160         static time_t last_fromdvp_report;
 2161         char *fakename;
 2162 
 2163         /*
 2164          * Loop getting the parent directory vnode until we get something we
 2165          * can resolve in the namecache.
 2166          */
 2167         vref(dvp);
 2168         nch.mount = dvp->v_mount;
 2169         nch.ncp = NULL;
 2170         fakename = NULL;
 2171 
 2172         for (;;) {
 2173                 if (fakename) {
 2174                         kfree(fakename, M_TEMP);
 2175                         fakename = NULL;
 2176                 }
 2177                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
 2178                                           &fakename);
 2179                 if (error) {
 2180                         vrele(dvp);
 2181                         break;
 2182                 }
 2183                 vn_unlock(pvp);
 2184                 spin_lock_shared(&pvp->v_spin);
 2185                 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
 2186                         _cache_hold(nch.ncp);
 2187                         spin_unlock_shared(&pvp->v_spin);
 2188                         vrele(pvp);
 2189                         break;
 2190                 }
 2191                 spin_unlock_shared(&pvp->v_spin);
 2192                 if (pvp->v_flag & VROOT) {
 2193                         nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
 2194                         error = cache_resolve_mp(nch.mount);
 2195                         _cache_unlock(nch.ncp);
 2196                         vrele(pvp);
 2197                         if (error) {
 2198                                 _cache_drop(nch.ncp);
 2199                                 nch.ncp = NULL;
 2200                                 vrele(dvp);
 2201                         }
 2202                         break;
 2203                 }
 2204                 vrele(dvp);
 2205                 dvp = pvp;
 2206         }
 2207         if (error == 0) {
 2208                 if (last_fromdvp_report != time_uptime) {
 2209                         last_fromdvp_report = time_uptime;
 2210                         kprintf("Warning: extremely inefficient path "
 2211                                 "resolution on %s\n",
 2212                                 nch.ncp->nc_name);
 2213                 }
 2214                 error = cache_inefficient_scan(&nch, cred, dvp, fakename);
 2215 
 2216                 /*
 2217                  * Hopefully dvp now has a namecache record associated with
 2218                  * it.  Leave it referenced to prevent the kernel from
 2219                  * recycling the vnode.  Otherwise extremely long directory
 2220                  * paths could result in endless recycling.
 2221                  */
 2222                 if (*saved_dvp)
 2223                     vrele(*saved_dvp);
 2224                 *saved_dvp = dvp;
 2225                 _cache_drop(nch.ncp);
 2226         }
 2227         if (fakename)
 2228                 kfree(fakename, M_TEMP);
 2229         return (error);
 2230 }
 2231 
 2232 /*
 2233  * Do an inefficient scan of the directory represented by ncp looking for
 2234  * the directory vnode dvp.  ncp must be held but not locked on entry and
 2235  * will be held on return.  dvp must be refd but not locked on entry and
 2236  * will remain refd on return.
 2237  *
 2238  * Why do this at all?  Well, due to its stateless nature the NFS server
 2239  * converts file handles directly to vnodes without necessarily going through
 2240  * the namecache ops that would otherwise create the namecache topology
 2241  * leading to the vnode.  We could either (1) Change the namecache algorithms
 2242  * to allow disconnected namecache records that are re-merged opportunistically,
 2243  * or (2) Make the NFS server backtrack and scan to recover a connected
 2244  * namecache topology in order to then be able to issue new API lookups.
 2245  *
 2246  * It turns out that (1) is a huge mess.  It takes a nice clean set of 
 2247  * namecache algorithms and introduces a lot of complication in every subsystem
 2248  * that calls into the namecache to deal with the re-merge case, especially
 2249  * since we are using the namecache to placehold negative lookups and the
 2250  * vnode might not be immediately assigned. (2) is certainly far less
 2251  * efficient than (1), but since we are only talking about directories here
 2252  * (which are likely to remain cached), the case does not actually run all
 2253  * that often and has the supreme advantage of not polluting the namecache
 2254  * algorithms.
 2255  *
 2256  * If a fakename is supplied just construct a namecache entry using the
 2257  * fake name.
 2258  */
 2259 static int
 2260 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 
 2261                        struct vnode *dvp, char *fakename)
 2262 {
 2263         struct nlcomponent nlc;
 2264         struct nchandle rncp;
 2265         struct dirent *den;
 2266         struct vnode *pvp;
 2267         struct vattr vat;
 2268         struct iovec iov;
 2269         struct uio uio;
 2270         int blksize;
 2271         int eofflag;
 2272         int bytes;
 2273         char *rbuf;
 2274         int error;
 2275 
 2276         vat.va_blocksize = 0;
 2277         if ((error = VOP_GETATTR(dvp, &vat)) != 0)
 2278                 return (error);
 2279         cache_lock(nch);
 2280         error = cache_vref(nch, cred, &pvp);
 2281         cache_unlock(nch);
 2282         if (error)
 2283                 return (error);
 2284         if (ncvp_debug) {
 2285                 kprintf("inefficient_scan: directory iosize %ld "
 2286                         "vattr fileid = %lld\n",
 2287                         vat.va_blocksize,
 2288                         (long long)vat.va_fileid);
 2289         }
 2290 
 2291         /*
 2292          * Use the supplied fakename if not NULL.  Fake names are typically
 2293          * not in the actual filesystem hierarchy.  This is used by HAMMER
 2294          * to glue @@timestamp recursions together.
 2295          */
 2296         if (fakename) {
 2297                 nlc.nlc_nameptr = fakename;
 2298                 nlc.nlc_namelen = strlen(fakename);
 2299                 rncp = cache_nlookup(nch, &nlc);
 2300                 goto done;
 2301         }
 2302 
 2303         if ((blksize = vat.va_blocksize) == 0)
 2304                 blksize = DEV_BSIZE;
 2305         rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
 2306         rncp.ncp = NULL;
 2307 
 2308         eofflag = 0;
 2309         uio.uio_offset = 0;
 2310 again:
 2311         iov.iov_base = rbuf;
 2312         iov.iov_len = blksize;
 2313         uio.uio_iov = &iov;
 2314         uio.uio_iovcnt = 1;
 2315         uio.uio_resid = blksize;
 2316         uio.uio_segflg = UIO_SYSSPACE;
 2317         uio.uio_rw = UIO_READ;
 2318         uio.uio_td = curthread;
 2319 
 2320         if (ncvp_debug >= 2)
 2321                 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
 2322         error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
 2323         if (error == 0) {
 2324                 den = (struct dirent *)rbuf;
 2325                 bytes = blksize - uio.uio_resid;
 2326 
 2327                 while (bytes > 0) {
 2328                         if (ncvp_debug >= 2) {
 2329                                 kprintf("cache_inefficient_scan: %*.*s\n",
 2330                                         den->d_namlen, den->d_namlen, 
 2331                                         den->d_name);
 2332                         }
 2333                         if (den->d_type != DT_WHT &&
 2334                             den->d_ino == vat.va_fileid) {
 2335                                 if (ncvp_debug) {
 2336                                         kprintf("cache_inefficient_scan: "
 2337                                                "MATCHED inode %lld path %s/%*.*s\n",
 2338                                                (long long)vat.va_fileid,
 2339                                                nch->ncp->nc_name,
 2340                                                den->d_namlen, den->d_namlen,
 2341                                                den->d_name);
 2342                                 }
 2343                                 nlc.nlc_nameptr = den->d_name;
 2344                                 nlc.nlc_namelen = den->d_namlen;
 2345                                 rncp = cache_nlookup(nch, &nlc);
 2346                                 KKASSERT(rncp.ncp != NULL);
 2347                                 break;
 2348                         }
 2349                         bytes -= _DIRENT_DIRSIZ(den);
 2350                         den = _DIRENT_NEXT(den);
 2351                 }
 2352                 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
 2353                         goto again;
 2354         }
 2355         kfree(rbuf, M_TEMP);
 2356 done:
 2357         vrele(pvp);
 2358         if (rncp.ncp) {
 2359                 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
 2360                         _cache_setvp(rncp.mount, rncp.ncp, dvp);
 2361                         if (ncvp_debug >= 2) {
 2362                                 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
 2363                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
 2364                         }
 2365                 } else {
 2366                         if (ncvp_debug >= 2) {
 2367                                 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 
 2368                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
 2369                                         rncp.ncp->nc_vp);
 2370                         }
 2371                 }
 2372                 if (rncp.ncp->nc_vp == NULL)
 2373                         error = rncp.ncp->nc_error;
 2374                 /* 
 2375                  * Release rncp after a successful nlookup.  rncp was fully
 2376                  * referenced.
 2377                  */
 2378                 cache_put(&rncp);
 2379         } else {
 2380                 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
 2381                         dvp, nch->ncp->nc_name);
 2382                 error = ENOENT;
 2383         }
 2384         return (error);
 2385 }
 2386 
 2387 /*
 2388  * Zap a namecache entry.  The ncp is unconditionally set to an unresolved
 2389  * state, which disassociates it from its vnode or ncneglist.
 2390  *
 2391  * Then, if there are no additional references to the ncp and no children,
 2392  * the ncp is removed from the topology and destroyed.
 2393  *
 2394  * References and/or children may exist if the ncp is in the middle of the
 2395  * topology, preventing the ncp from being destroyed.
 2396  *
 2397  * This function must be called with the ncp held and locked and will unlock
 2398  * and drop it during zapping.
 2399  *
 2400  * If nonblock is non-zero and the parent ncp cannot be locked we give up.
 2401  * This case can occur in the cache_drop() path.
 2402  *
 2403  * This function may return a held (but NOT locked) parent node which the
 2404  * caller must drop.  We do this so _cache_drop() can loop, to avoid
 2405  * blowing out the kernel stack.
 2406  *
 2407  * WARNING!  For MPSAFE operation this routine must acquire up to three
 2408  *           spin locks to be able to safely test nc_refs.  Lock order is
 2409  *           very important.
 2410  *
 2411  *           hash spinlock if on hash list
 2412  *           parent spinlock if child of parent
 2413  *           (the ncp is unresolved so there is no vnode association)
 2414  */
 2415 static struct namecache *
 2416 cache_zap(struct namecache *ncp, int nonblock)
 2417 {
 2418         struct namecache *par;
 2419         struct vnode *dropvp;
 2420         int refs;
 2421 
 2422         /*
 2423          * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
 2424          */
 2425         _cache_setunresolved(ncp);
 2426 
 2427         /*
 2428          * Try to scrap the entry and possibly tail-recurse on its parent.
 2429  * We only scrap unref'd (other than our ref) unresolved entries,
 2430          * we do not scrap 'live' entries.
 2431          *
 2432          * Note that once the spinlocks are acquired if nc_refs == 1 no
 2433          * other references are possible.  If it isn't, however, we have
 2434          * to decrement but also be sure to avoid a 1->0 transition.
 2435          */
 2436         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
 2437         KKASSERT(ncp->nc_refs > 0);
 2438 
 2439         /*
 2440          * Acquire locks.  Note that the parent can't go away while we hold
 2441          * a child locked.
 2442          */
 2443         if ((par = ncp->nc_parent) != NULL) {
 2444                 if (nonblock) {
 2445                         for (;;) {
 2446                                 if (_cache_lock_nonblock(par) == 0)
 2447                                         break;
 2448                                 refs = ncp->nc_refs;
 2449                                 ncp->nc_flag |= NCF_DEFEREDZAP;
 2450                                 ++numdefered;   /* MP race ok */
 2451                                 if (atomic_cmpset_int(&ncp->nc_refs,
 2452                                                       refs, refs - 1)) {
 2453                                         _cache_unlock(ncp);
 2454                                         return(NULL);
 2455                                 }
 2456                                 cpu_pause();
 2457                         }
 2458                         _cache_hold(par);
 2459                 } else {
 2460                         _cache_hold(par);
 2461                         _cache_lock(par);
 2462                 }
 2463                 spin_lock(&ncp->nc_head->spin);
 2464         }
 2465 
 2466         /*
 2467  * If someone other than us has a ref or we have children
 2468          * we cannot zap the entry.  The 1->0 transition and any
 2469          * further list operation is protected by the spinlocks
 2470          * we have acquired but other transitions are not.
 2471          */
 2472         for (;;) {
 2473                 refs = ncp->nc_refs;
 2474                 if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list))
 2475                         break;
 2476                 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) {
 2477                         if (par) {
 2478                                 spin_unlock(&ncp->nc_head->spin);
 2479                                 _cache_put(par);
 2480                         }
 2481                         _cache_unlock(ncp);
 2482                         return(NULL);
 2483                 }
 2484                 cpu_pause();
 2485         }
 2486 
 2487         /*
 2488          * We are the only ref and with the spinlocks held no further
 2489          * refs can be acquired by others.
 2490          *
 2491          * Remove us from the hash list and parent list.  We have to
 2492          * drop a ref on the parent's vp if the parent's list becomes
 2493          * empty.
 2494          */
 2495         dropvp = NULL;
 2496         if (par) {
 2497                 struct nchash_head *nchpp = ncp->nc_head;
 2498 
 2499                 KKASSERT(nchpp != NULL);
 2500                 LIST_REMOVE(ncp, nc_hash);
 2501                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
 2502                 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
 2503                         dropvp = par->nc_vp;
 2504                 ncp->nc_head = NULL;
 2505                 ncp->nc_parent = NULL;
 2506                 spin_unlock(&nchpp->spin);
 2507                 _cache_unlock(par);
 2508         } else {
 2509                 KKASSERT(ncp->nc_head == NULL);
 2510         }
 2511 
 2512         /*
 2513          * ncp should not have picked up any refs.  Physically
 2514          * destroy the ncp.
 2515          */
 2516         KKASSERT(ncp->nc_refs == 1);
 2517         /* _cache_unlock(ncp) not required */
 2518         ncp->nc_refs = -1;      /* safety */
 2519         if (ncp->nc_name)
 2520                 kfree(ncp->nc_name, M_VFSCACHE);
 2521         kfree(ncp, M_VFSCACHE);
 2522 
 2523         /*
 2524          * Delayed drop (we had to release our spinlocks)
 2525          *
 2526  * The refed parent (if not NULL) must be dropped.  The
 2527          * caller is responsible for looping.
 2528          */
 2529         if (dropvp)
 2530                 vdrop(dropvp);
 2531         return(par);
 2532 }
 2533 
 2534 /*
 2535  * Clean up dangling negative cache and deferred-drop entries in the
 2536  * namecache.
 2537  *
 2538  * This routine is called in the critical path and also called from
 2539  * vnlru().  When called from vnlru we use a lower limit to try to
 2540  * deal with the negative cache before the critical path has to start
 2541  * dealing with it.
 2542  */
 2543 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;
 2544 
 2545 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
 2546 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
 2547 
 2548 void
 2549 cache_hysteresis(int critpath)
 2550 {
 2551         int poslimit;
 2552         int neglimit = desiredvnodes / ncnegfactor;
 2553         int xnumcache = numcache;
 2554 
 2555         if (critpath == 0)
 2556                 neglimit = neglimit * 8 / 10;
 2557 
 2558         /*
 2559          * Don't cache too many negative hits.  We use hysteresis to reduce
 2560          * the impact on the critical path.
 2561          */
 2562         switch(neg_cache_hysteresis_state[critpath]) {
 2563         case CHI_LOW:
 2564                 if (numneg > MINNEG && numneg > neglimit) {
 2565                         if (critpath)
 2566                                 _cache_cleanneg(ncnegflush);
 2567                         else
 2568                                 _cache_cleanneg(ncnegflush +
 2569                                                 numneg - neglimit);
 2570                         neg_cache_hysteresis_state[critpath] = CHI_HIGH;
 2571                 }
 2572                 break;
 2573         case CHI_HIGH:
 2574                 if (numneg > MINNEG * 9 / 10 && 
 2575                     numneg * 9 / 10 > neglimit
 2576                 ) {
 2577                         if (critpath)
 2578                                 _cache_cleanneg(ncnegflush);
 2579                         else
 2580                                 _cache_cleanneg(ncnegflush +
 2581                                                 numneg * 9 / 10 - neglimit);
 2582                 } else {
 2583                         neg_cache_hysteresis_state[critpath] = CHI_LOW;
 2584                 }
 2585                 break;
 2586         }
 2587 
 2588         /*
 2589          * Don't cache too many positive hits.  We use hysteresis to reduce
 2590          * the impact on the critical path.
 2591          *
 2592          * Excessive positive hits can accumulate due to large numbers of
 2593          * hardlinks (the vnode cache will not prevent hl ncps from growing
 2594          * into infinity).
 2595          */
 2596         if ((poslimit = ncposlimit) == 0)
 2597                 poslimit = desiredvnodes * 2;
 2598         if (critpath == 0)
 2599                 poslimit = poslimit * 8 / 10;
 2600 
 2601         switch(pos_cache_hysteresis_state[critpath]) {
 2602         case CHI_LOW:
 2603                 if (xnumcache > poslimit && xnumcache > MINPOS) {
 2604                         if (critpath)
 2605                                 _cache_cleanpos(ncposflush);
 2606                         else
 2607                                 _cache_cleanpos(ncposflush +
 2608                                                 xnumcache - poslimit);
 2609                         pos_cache_hysteresis_state[critpath] = CHI_HIGH;
 2610                 }
 2611                 break;
 2612         case CHI_HIGH:
 2613                 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) {
 2614                         if (critpath)
 2615                                 _cache_cleanpos(ncposflush);
 2616                         else
 2617                                 _cache_cleanpos(ncposflush +
 2618                                                 xnumcache - poslimit * 5 / 6);
 2619                 } else {
 2620                         pos_cache_hysteresis_state[critpath] = CHI_LOW;
 2621                 }
 2622                 break;
 2623         }
 2624 
 2625         /*
 2626  * Clean out dangling deferred-zap ncps which could not
 2627          * be cleanly dropped if too many build up.  Note
 2628          * that numdefered is not an exact number as such ncps
 2629          * can be reused and the counter is not handled in a MP
 2630          * safe manner by design.
 2631          */
 2632         if (numdefered > neglimit) {
 2633                 _cache_cleandefered();
 2634         }
 2635 }
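
/*
 * Illustrative sketch (not part of the original file): the two call styles
 * described above.  critpath=1 is the cheap form used from hot lookup
 * paths; critpath=0 lowers the limits to 80% of normal and is suited to a
 * background maintenance loop such as vnlru.  The example_* helper is
 * hypothetical.
 */
#if 0
static void
example_cache_pressure(int from_critical_path)
{
        if (from_critical_path)
                cache_hysteresis(1);    /* quick check on a hot path */
        else
                cache_hysteresis(0);    /* stricter limits, e.g. from vnlru */
}
#endif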
 2636 
 2637 /*
 2638  * NEW NAMECACHE LOOKUP API
 2639  *
 2640  * Lookup an entry in the namecache.  The passed par_nch must be referenced
 2641  * and unlocked.  A referenced and locked nchandle with a non-NULL nch.ncp
 2642  * is ALWAYS returned, even if the supplied component is illegal.
 2643  *
 2644  * The resulting namecache entry should be returned to the system with
 2645  * cache_put() or cache_unlock() + cache_drop().
 2646  *
 2647  * namecache locks are recursive but care must be taken to avoid lock order
 2648  * reversals (hence why the passed par_nch must be unlocked).  Lock
 2649  * ordering is defined for parent traversals, not for child traversals.
 2650  *
 2651  * Nobody else will be able to manipulate the associated namespace (e.g.
 2652  * create, delete, rename, rename-target) until the caller unlocks the
 2653  * entry.
 2654  *
 2655  * The returned entry will be in one of three states:  positive hit (non-null
 2656  * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
 2657  * Unresolved entries must be resolved through the filesystem to associate the
 2658  * vnode and/or determine whether a positive or negative hit has occurred.
 2659  *
 2660  * It is not necessary to lock a directory in order to lock namespace under
 2661  * that directory.  In fact, it is explicitly not allowed to do that.  A
 2662  * directory is typically only locked when being created, renamed, or
 2663  * destroyed.
 2664  *
 2665  * The directory (par) may be unresolved, in which case any returned child
 2666  * will likely also be marked unresolved.  Likely but not guaranteed.  Since
 2667  * the filesystem lookup requires a resolved directory vnode the caller is
 2668  * responsible for resolving the namecache chain top-down.  This API 
 2669  * specifically allows whole chains to be created in an unresolved state.
 2670  */
 2671 struct nchandle
 2672 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
 2673 {
 2674         struct nchandle nch;
 2675         struct namecache *ncp;
 2676         struct namecache *new_ncp;
 2677         struct nchash_head *nchpp;
 2678         struct mount *mp;
 2679         u_int32_t hash;
 2680         globaldata_t gd;
 2681         int par_locked;
 2682 
 2683         numcalls++;
 2684         gd = mycpu;
 2685         mp = par_nch->mount;
 2686         par_locked = 0;
 2687 
 2688         /*
 2689          * This is a good time to call it, no ncp's are locked by
 2690          * the caller or us.
 2691          */
 2692         cache_hysteresis(1);
 2693 
 2694         /*
 2695          * Try to locate an existing entry
 2696          */
 2697         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
 2698         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
 2699         new_ncp = NULL;
 2700         nchpp = NCHHASH(hash);
 2701 restart:
 2702         if (new_ncp)
 2703                 spin_lock(&nchpp->spin);
 2704         else
 2705                 spin_lock_shared(&nchpp->spin);
 2706 
 2707         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
 2708                 numchecks++;
 2709 
 2710                 /*
 2711                  * Break out if we find a matching entry.  Note that
 2712                  * UNRESOLVED entries may match, but DESTROYED entries
 2713                  * do not.
 2714                  */
 2715                 if (ncp->nc_parent == par_nch->ncp &&
 2716                     ncp->nc_nlen == nlc->nlc_namelen &&
 2717                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
 2718                     (ncp->nc_flag & NCF_DESTROYED) == 0
 2719                 ) {
 2720                         _cache_hold(ncp);
 2721                         if (new_ncp)
 2722                                 spin_unlock(&nchpp->spin);
 2723                         else
 2724                                 spin_unlock_shared(&nchpp->spin);
 2725                         if (par_locked) {
 2726                                 _cache_unlock(par_nch->ncp);
 2727                                 par_locked = 0;
 2728                         }
 2729                         if (_cache_lock_special(ncp) == 0) {
 2730                                 /*
 2731                                  * Successfully locked but we must re-test
 2732                                  * conditions that might have changed since
 2733                                  * we did not have the lock before.
 2734                                  */
 2735                                 if ((ncp->nc_flag & NCF_DESTROYED) ||
 2736                                     ncp->nc_parent != par_nch->ncp) {
 2737                                         _cache_put(ncp);
 2738                                         goto restart;
 2739                                 }
 2740                                 _cache_auto_unresolve(mp, ncp);
 2741                                 if (new_ncp)
 2742                                         _cache_free(new_ncp);
 2743                                 goto found;
 2744                         }
 2745                         _cache_get(ncp);        /* cycle the lock to block */
 2746                         _cache_put(ncp);
 2747                         _cache_drop(ncp);
 2748                         goto restart;
 2749                 }
 2750         }
 2751 
 2752         /*
 2753          * We failed to locate an entry, create a new entry and add it to
 2754          * the cache.  The parent ncp must also be locked so we
 2755          * can link into it.
 2756          *
 2757          * We have to relookup after possibly blocking in kmalloc or
 2758          * when locking par_nch.
 2759          *
 2760          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
 2761          *       mount case, in which case nc_name will be NULL.
 2762          */
 2763         if (new_ncp == NULL) {
 2764                 spin_unlock_shared(&nchpp->spin);
 2765                 new_ncp = cache_alloc(nlc->nlc_namelen);
 2766                 if (nlc->nlc_namelen) {
 2767                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
 2768                               nlc->nlc_namelen);
 2769                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
 2770                 }
 2771                 goto restart;
 2772         }
 2773 
 2774         /*
 2775          * NOTE! The spinlock is held exclusively here because new_ncp
 2776          *       is non-NULL.
 2777          */
 2778         if (par_locked == 0) {
 2779                 spin_unlock(&nchpp->spin);
 2780                 _cache_lock(par_nch->ncp);
 2781                 par_locked = 1;
 2782                 goto restart;
 2783         }
 2784 
 2785         /*
 2786          * WARNING!  We still hold the spinlock.  We have to set the hash
 2787          *           table entry atomically.
 2788          */
 2789         ncp = new_ncp;
 2790         _cache_link_parent(ncp, par_nch->ncp, nchpp);
 2791         spin_unlock(&nchpp->spin);
 2792         _cache_unlock(par_nch->ncp);
 2793         /* par_locked = 0 - not used */
 2794 found:
 2795         /*
 2796          * stats and namecache size management
 2797          */
 2798         if (ncp->nc_flag & NCF_UNRESOLVED)
 2799                 ++gd->gd_nchstats->ncs_miss;
 2800         else if (ncp->nc_vp)
 2801                 ++gd->gd_nchstats->ncs_goodhits;
 2802         else
 2803                 ++gd->gd_nchstats->ncs_neghits;
 2804         nch.mount = mp;
 2805         nch.ncp = ncp;
 2806         atomic_add_int(&nch.mount->mnt_refs, 1);
 2807         return(nch);
 2808 }
 2809 
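/*
 * Illustrative sketch (not part of the original source): a typical caller
 * of cache_nlookup() following the rules described in the comment block
 * above.  The helper name and error handling are hypothetical; cache_put()
 * and cache_resolve() are the routines referenced by that comment.
 */
static int
example_lookup_child(struct nchandle *par_nch, const char *name,
                     struct ucred *cred)
{
        struct nlcomponent nlc;
        struct nchandle nch;
        int error;

        bzero(&nlc, sizeof(nlc));
        nlc.nlc_nameptr = __DECONST(char *, name);
        nlc.nlc_namelen = strlen(name);

        /* par_nch must be referenced but unlocked at this point. */
        nch = cache_nlookup(par_nch, &nlc);

        /* Unresolved entries must be resolved through the filesystem. */
        if (nch.ncp->nc_flag & NCF_UNRESOLVED)
                error = cache_resolve(&nch, cred);
        else
                error = nch.ncp->nc_error;

        /* Return the locked + referenced entry to the system. */
        cache_put(&nch);
        return (error);
}
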
 2810 /*
 2811  * Attempt to lookup a namecache entry and return with a shared namecache
 2812  * lock.
 2813  */
 2814 int
 2815 cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc,
 2816                            int excl, struct nchandle *res_nch)
 2817 {
 2818         struct namecache *ncp;
 2819         struct nchash_head *nchpp;
 2820         struct mount *mp;
 2821         u_int32_t hash;
 2822         globaldata_t gd;
 2823 
 2824         /*
 2825          * If exclusive requested or shared namecache locks are disabled,
 2826          * return failure.
 2827          */
 2828         if (ncp_shared_lock_disable || excl)
 2829                 return(EWOULDBLOCK);
 2830 
 2831         numcalls++;
 2832         gd = mycpu;
 2833         mp = par_nch->mount;
 2834 
 2835         /*
 2836          * This is a good time to call it, no ncp's are locked by
 2837          * the caller or us.
 2838          */
 2839         cache_hysteresis(1);
 2840 
 2841         /*
 2842          * Try to locate an existing entry
 2843          */
 2844         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
 2845         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
 2846         nchpp = NCHHASH(hash);
 2847 
 2848         spin_lock_shared(&nchpp->spin);
 2849 
 2850         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
 2851                 numchecks++;
 2852 
 2853                 /*
 2854                  * Break out if we find a matching entry.  Note that
 2855                  * UNRESOLVED entries may match, but DESTROYED entries
 2856                  * do not.
 2857                  */
 2858                 if (ncp->nc_parent == par_nch->ncp &&
 2859                     ncp->nc_nlen == nlc->nlc_namelen &&
 2860                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
 2861                     (ncp->nc_flag & NCF_DESTROYED) == 0
 2862                 ) {
 2863                         _cache_hold(ncp);
 2864                         spin_unlock_shared(&nchpp->spin);
 2865                         if (_cache_lock_shared_special(ncp) == 0) {
 2866                                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
 2867                                     (ncp->nc_flag & NCF_DESTROYED) == 0 &&
 2868                                     _cache_auto_unresolve_test(mp, ncp) == 0) {
 2869                                         goto found;
 2870                                 }
 2871                                 _cache_unlock(ncp);
 2872                         }
 2873                         _cache_drop(ncp);
 2874                         spin_lock_shared(&nchpp->spin);
 2875                         break;
 2876                 }
 2877         }
 2878 
 2879         /*
 2880          * Failure
 2881          */
 2882         spin_unlock_shared(&nchpp->spin);
 2883         return(EWOULDBLOCK);
 2884 
 2885         /*
 2886          * Success
 2887          *
 2888          * Note that nc_error might be non-zero (e.g. ENOENT).
 2889          */
 2890 found:
 2891         res_nch->mount = mp;
 2892         res_nch->ncp = ncp;
 2893         ++gd->gd_nchstats->ncs_goodhits;
 2894         atomic_add_int(&res_nch->mount->mnt_refs, 1);
 2895 
 2896         KKASSERT(ncp->nc_error != EWOULDBLOCK);
 2897         return(ncp->nc_error);
 2898 }
 2899 
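/*
 * Illustrative sketch (not part of the original source): callers of
 * cache_nlookup_maybe_shared() typically fall back to the exclusive
 * cache_nlookup() path when EWOULDBLOCK is returned.  The helper name is
 * hypothetical.
 */
static void
example_lookup_shared_or_excl(struct nchandle *par_nch,
                              struct nlcomponent *nlc, int wantexcl,
                              struct nchandle *res_nch)
{
        if (cache_nlookup_maybe_shared(par_nch, nlc, wantexcl,
                                       res_nch) == EWOULDBLOCK) {
                /* Shared path refused, take the exclusive-lock path. */
                *res_nch = cache_nlookup(par_nch, nlc);
        }
}
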
 2900 /*
 2901  * This is a non-blocking version of cache_nlookup() used by
 2902  * nfs_readdirplusrpc_uio().  It can fail for any reason and
 2903  * will return nch.ncp == NULL in that case.
 2904  */
 2905 struct nchandle
 2906 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
 2907 {
 2908         struct nchandle nch;
 2909         struct namecache *ncp;
 2910         struct namecache *new_ncp;
 2911         struct nchash_head *nchpp;
 2912         struct mount *mp;
 2913         u_int32_t hash;
 2914         globaldata_t gd;
 2915         int par_locked;
 2916 
 2917         numcalls++;
 2918         gd = mycpu;
 2919         mp = par_nch->mount;
 2920         par_locked = 0;
 2921 
 2922         /*
 2923          * Try to locate an existing entry
 2924          */
 2925         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
 2926         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
 2927         new_ncp = NULL;
 2928         nchpp = NCHHASH(hash);
 2929 restart:
 2930         spin_lock(&nchpp->spin);
 2931         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
 2932                 numchecks++;
 2933 
 2934                 /*
 2935                  * Break out if we find a matching entry.  Note that
 2936                  * UNRESOLVED entries may match, but DESTROYED entries
 2937                  * do not.
 2938                  */
 2939                 if (ncp->nc_parent == par_nch->ncp &&
 2940                     ncp->nc_nlen == nlc->nlc_namelen &&
 2941                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
 2942                     (ncp->nc_flag & NCF_DESTROYED) == 0
 2943                 ) {
 2944                         _cache_hold(ncp);
 2945                         spin_unlock(&nchpp->spin);
 2946                         if (par_locked) {
 2947                                 _cache_unlock(par_nch->ncp);
 2948                                 par_locked = 0;
 2949                         }
 2950                         if (_cache_lock_special(ncp) == 0) {
 2951                                 _cache_auto_unresolve(mp, ncp);
 2952                                 if (new_ncp) {
 2953                                         _cache_free(new_ncp);
 2954                                         new_ncp = NULL;
 2955                                 }
 2956                                 goto found;
 2957                         }
 2958                         _cache_drop(ncp);
 2959                         goto failed;
 2960                 }
 2961         }
 2962 
 2963         /*
 2964          * We failed to locate an entry, create a new entry and add it to
 2965          * the cache.  The parent ncp must also be locked so we
 2966          * can link into it.
 2967          *
 2968          * We have to relookup after possibly blocking in kmalloc or
 2969          * when locking par_nch.
 2970          *
 2971          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
 2972          *       mount case, in which case nc_name will be NULL.
 2973          */
 2974         if (new_ncp == NULL) {
 2975                 spin_unlock(&nchpp->spin);
 2976                 new_ncp = cache_alloc(nlc->nlc_namelen);
 2977                 if (nlc->nlc_namelen) {
 2978                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
 2979                               nlc->nlc_namelen);
 2980                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
 2981                 }
 2982                 goto restart;
 2983         }
 2984         if (par_locked == 0) {
 2985                 spin_unlock(&nchpp->spin);
 2986                 if (_cache_lock_nonblock(par_nch->ncp) == 0) {
 2987                         par_locked = 1;
 2988                         goto restart;
 2989                 }
 2990                 goto failed;
 2991         }
 2992 
 2993         /*
 2994          * WARNING!  We still hold the spinlock.  We have to set the hash
 2995          *           table entry atomically.
 2996          */
 2997         ncp = new_ncp;
 2998         _cache_link_parent(ncp, par_nch->ncp, nchpp);
 2999         spin_unlock(&nchpp->spin);
 3000         _cache_unlock(par_nch->ncp);
 3001         /* par_locked = 0 - not used */
 3002 found:
 3003         /*
 3004          * stats and namecache size management
 3005          */
 3006         if (ncp->nc_flag & NCF_UNRESOLVED)
 3007                 ++gd->gd_nchstats->ncs_miss;
 3008         else if (ncp->nc_vp)
 3009                 ++gd->gd_nchstats->ncs_goodhits;
 3010         else
 3011                 ++gd->gd_nchstats->ncs_neghits;
 3012         nch.mount = mp;
 3013         nch.ncp = ncp;
 3014         atomic_add_int(&nch.mount->mnt_refs, 1);
 3015         return(nch);
 3016 failed:
 3017         if (new_ncp) {
 3018                 _cache_free(new_ncp);
 3019                 new_ncp = NULL;
 3020         }
 3021         nch.mount = NULL;
 3022         nch.ncp = NULL;
 3023         return(nch);
 3024 }
 3025 
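/*
 * Illustrative sketch (not part of the original source): the non-blocking
 * lookup above can fail for any reason, so its callers must check for a
 * NULL ncp before using the result.  The helper name is hypothetical.
 */
static int
example_lookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc,
                        struct nchandle *res)
{
        *res = cache_nlookup_nonblock(par_nch, nlc);
        if (res->ncp == NULL)
                return (EWOULDBLOCK);   /* caller falls back or retries */
        return (0);
}
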
 3026 /*
 3027  * The namecache entry is marked as being used as a mount point. 
 3028  * Locate the mount if it is visible to the caller.  The DragonFly
 3029  * mount system allows arbitrary loops in the topology and disentangles
 3030  * those loops by matching against (mp, ncp) rather than just (ncp).
 3031  * This means any given ncp can dive any number of mounts, depending
 3032  * on the relative mount (e.g. nullfs) the caller is at in the topology.
 3033  *
 3034  * We use a very simple frontend cache to reduce SMP conflicts,
 3035  * which we have to do because the mountlist scan needs an exclusive
 3036  * lock around its ripout info list.  Not to mention that there might
 3037  * be a lot of mounts.
 3038  */
 3039 struct findmount_info {
 3040         struct mount *result;
 3041         struct mount *nch_mount;
 3042         struct namecache *nch_ncp;
 3043 };
 3044 
 3045 static
 3046 struct ncmount_cache *
 3047 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
 3048 {
 3049         int hash;
 3050 
 3051         hash = ((int)(intptr_t)mp / sizeof(*mp)) ^
 3052                ((int)(intptr_t)ncp / sizeof(*ncp));
 3053         hash = (hash & 0x7FFFFFFF) % NCMOUNT_NUMCACHE;
 3054         return (&ncmount_cache[hash]);
 3055 }
 3056 
 3057 static
 3058 int
 3059 cache_findmount_callback(struct mount *mp, void *data)
 3060 {
 3061         struct findmount_info *info = data;
 3062 
 3063         /*
 3064          * Check the mount's mounted-on point against the passed nch.
 3065          */
 3066         if (mp->mnt_ncmounton.mount == info->nch_mount &&
 3067             mp->mnt_ncmounton.ncp == info->nch_ncp
 3068         ) {
 3069             info->result = mp;
 3070             atomic_add_int(&mp->mnt_refs, 1);
 3071             return(-1);
 3072         }
 3073         return(0);
 3074 }
 3075 
 3076 struct mount *
 3077 cache_findmount(struct nchandle *nch)
 3078 {
 3079         struct findmount_info info;
 3080         struct ncmount_cache *ncc;
 3081         struct mount *mp;
 3082 
 3083         /*
 3084          * Fast
 3085          */
 3086         if (ncmount_cache_enable == 0) {
 3087                 ncc = NULL;
 3088                 goto skip;
 3089         }
 3090         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
 3091         if (ncc->ncp == nch->ncp) {
 3092                 spin_lock_shared(&ncc->spin);
 3093                 if (ncc->isneg == 0 &&
 3094                     ncc->ncp == nch->ncp && (mp = ncc->mp) != NULL) {
 3095                         if (mp->mnt_ncmounton.mount == nch->mount &&
 3096                             mp->mnt_ncmounton.ncp == nch->ncp) {
 3097                                 /*
 3098                                  * Cache hit (positive)
 3099                                  */
 3100                                 atomic_add_int(&mp->mnt_refs, 1);
 3101                                 spin_unlock_shared(&ncc->spin);
 3102                                 ++ncmount_cache_hit;
 3103                                 return(mp);
 3104                         }
 3105                         /* else cache miss */
 3106                 }
 3107                 if (ncc->isneg &&
 3108                     ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
 3109                         /*
 3110                          * Cache hit (negative)
 3111                          */
 3112                         spin_unlock_shared(&ncc->spin);
 3113                         ++ncmount_cache_hit;
 3114                         return(NULL);
 3115                 }
 3116                 spin_unlock_shared(&ncc->spin);
 3117         }
 3118 skip:
 3119 
 3120         /*
 3121          * Slow
 3122          */
 3123         info.result = NULL;
 3124         info.nch_mount = nch->mount;
 3125         info.nch_ncp = nch->ncp;
 3126         mountlist_scan(cache_findmount_callback, &info,
 3127                                MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
 3128 
 3129         /*
 3130          * Cache the result.
 3131          *
 3132          * Negative lookups: We cache the originating {ncp,mp}. (mp) is
 3133          *                   only used for pointer comparisons and is not
 3134          *                   referenced (otherwise there would be dangling
 3135          *                   refs).
 3136          *
 3137          * Positive lookups: We cache the originating {ncp} and the target
 3138          *                   (mp).  (mp) is referenced.
 3139          *
 3140          * Indeterminate:    If the match is undergoing an unmount we do
 3141          *                   not cache it to avoid racing cache_unmounting(),
 3142          *                   but still return the match.
 3143          */
 3144         if (ncc) {
 3145                 spin_lock(&ncc->spin);
 3146                 if (info.result == NULL) {
 3147                         if (ncc->isneg == 0 && ncc->mp)
 3148                                 atomic_add_int(&ncc->mp->mnt_refs, -1);
 3149                         ncc->ncp = nch->ncp;
 3150                         ncc->mp = nch->mount;
 3151                         ncc->isneg = 1;
 3152                         spin_unlock(&ncc->spin);
 3153                         ++ncmount_cache_overwrite;
 3154                 } else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) {
 3155                         if (ncc->isneg == 0 && ncc->mp)
 3156                                 atomic_add_int(&ncc->mp->mnt_refs, -1);
 3157                         atomic_add_int(&info.result->mnt_refs, 1);
 3158                         ncc->ncp = nch->ncp;
 3159                         ncc->mp = info.result;
 3160                         ncc->isneg = 0;
 3161                         spin_unlock(&ncc->spin);
 3162                         ++ncmount_cache_overwrite;
 3163                 } else {
 3164                         spin_unlock(&ncc->spin);
 3165                 }
 3166                 ++ncmount_cache_miss;
 3167         }
 3168         return(info.result);
 3169 }
 3170 
 3171 void
 3172 cache_dropmount(struct mount *mp)
 3173 {
 3174         atomic_add_int(&mp->mnt_refs, -1);
 3175 }
 3176 
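/*
 * Illustrative sketch (not part of the original source): cache_findmount()
 * returns a referenced mount (or NULL), and the reference must be released
 * with cache_dropmount().  The helper name is hypothetical.
 */
static int
example_is_mount_point(struct nchandle *nch)
{
        struct mount *mp;

        mp = cache_findmount(nch);
        if (mp == NULL)
                return (0);             /* nothing mounted on this nch */
        /* ... a real caller would normally descend into mp here ... */
        cache_dropmount(mp);            /* release the reference */
        return (1);
}
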
 3177 void
 3178 cache_ismounting(struct mount *mp)
 3179 {
 3180         struct nchandle *nch = &mp->mnt_ncmounton;
 3181         struct ncmount_cache *ncc;
 3182 
 3183         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
 3184         if (ncc->isneg &&
 3185             ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
 3186                 spin_lock(&ncc->spin);
 3187                 if (ncc->isneg &&
 3188                     ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
 3189                         ncc->ncp = NULL;
 3190                         ncc->mp = NULL;
 3191                 }
 3192                 spin_unlock(&ncc->spin);
 3193         }
 3194 }
 3195 
 3196 void
 3197 cache_unmounting(struct mount *mp)
 3198 {
 3199         struct nchandle *nch = &mp->mnt_ncmounton;
 3200         struct ncmount_cache *ncc;
 3201 
 3202         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
 3203         if (ncc->isneg == 0 &&
 3204             ncc->ncp == nch->ncp && ncc->mp == mp) {
 3205                 spin_lock(&ncc->spin);
 3206                 if (ncc->isneg == 0 &&
 3207                     ncc->ncp == nch->ncp && ncc->mp == mp) {
 3208                         atomic_add_int(&mp->mnt_refs, -1);
 3209                         ncc->ncp = NULL;
 3210                         ncc->mp = NULL;
 3211                 }
 3212                 spin_unlock(&ncc->spin);
 3213         }
 3214 }
 3215 
 3216 /*
 3217  * Resolve an unresolved namecache entry, generally by looking it up.
 3218  * The passed ncp must be locked and refd. 
 3219  *
 3220  * Theoretically since a vnode cannot be recycled while held, and since
 3221  * the nc_parent chain holds its vnode as long as children exist, the
 3222  * direct parent of the cache entry we are trying to resolve should
 3223  * have a valid vnode.  If not then generate an error that we can 
 3224  * determine is related to a resolver bug.
 3225  *
 3226  * However, if a vnode was in the middle of a recyclement when the NCP
 3227  * got locked, ncp->nc_vp might point to a vnode that is about to become
 3228  * invalid.  cache_resolve() handles this case by unresolving the entry
 3229  * and then re-resolving it.
 3230  *
 3231  * Note that successful resolution does not necessarily return an error
 3232  * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
 3233  * will be returned.
 3234  */
 3235 int
 3236 cache_resolve(struct nchandle *nch, struct ucred *cred)
 3237 {
 3238         struct namecache *par_tmp;
 3239         struct namecache *par;
 3240         struct namecache *ncp;
 3241         struct nchandle nctmp;
 3242         struct mount *mp;
 3243         struct vnode *dvp;
 3244         int error;
 3245 
 3246         ncp = nch->ncp;
 3247         mp = nch->mount;
 3248         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
 3249 restart:
 3250         /*
 3251          * If the ncp is already resolved we have nothing to do.  However,
 3252          * we do want to guarantee that a usable vnode is returned when
 3253          * a vnode is present, so make sure it hasn't been reclaimed.
 3254          */
 3255         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
 3256                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
 3257                         _cache_setunresolved(ncp);
 3258                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
 3259                         return (ncp->nc_error);
 3260         }
 3261 
 3262         /*
 3263          * If the ncp was destroyed it will never resolve again.  This
 3264          * can basically only happen when someone is chdir'd into an
 3265          * empty directory which is then rmdir'd.  We want to catch this
 3266          * here and not dive the VFS because the VFS might actually
 3267          * have a way to re-resolve the disconnected ncp, which will
 3268          * result in inconsistencies in the cdir/nch for proc->p_fd.
 3269          */
 3270         if (ncp->nc_flag & NCF_DESTROYED) {
 3271                 kprintf("Warning: cache_resolve: ncp '%s' was unlinked\n",
 3272                         ncp->nc_name);
 3273                 return(EINVAL);
 3274         }
 3275 
 3276         /*
 3277          * Mount points need special handling because the parent does not
 3278          * belong to the same filesystem as the ncp.
 3279          */
 3280         if (ncp == mp->mnt_ncmountpt.ncp)
 3281                 return (cache_resolve_mp(mp));
 3282 
 3283         /*
 3284          * We expect an unbroken chain of ncps to at least the mount point,
 3285          * and even all the way to root (but this code doesn't have to go
 3286          * past the mount point).
 3287          */
 3288         if (ncp->nc_parent == NULL) {
 3289                 kprintf("EXDEV case 1 %p %*.*s\n", ncp,
 3290                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
 3291                 ncp->nc_error = EXDEV;
 3292                 return(ncp->nc_error);
 3293         }
 3294 
 3295         /*
 3296          * The vp's of the parent directories in the chain are held via vhold()
 3297  * due to the existence of the child, and should not disappear.
 3298          * However, there are cases where they can disappear:
 3299          *
 3300          *      - due to filesystem I/O errors.
 3301          *      - due to NFS being stupid about tracking the namespace and
 3302          *        destroying the namespace for entire directories quite often.
 3303          *      - due to forced unmounts.
 3304          *      - due to an rmdir (parent will be marked DESTROYED)
 3305          *
 3306          * When this occurs we have to track the chain backwards and resolve
 3307          * it, looping until the resolver catches up to the current node.  We
 3308          * could recurse here but we might run ourselves out of kernel stack
 3309          * so we do it in a more painful manner.  This situation really should
 3310          * not occur all that often, and when it does it should not have to
 3311          * go back too many nodes to resolve the ncp.
 3312          */
 3313         while ((dvp = cache_dvpref(ncp)) == NULL) {
 3314                 /*
 3315                  * This case can occur if a process is CD'd into a
 3316                  * directory which is then rmdir'd.  If the parent is marked
 3317                  * destroyed there is no point trying to resolve it.
 3318                  */
 3319                 if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
 3320                         return(ENOENT);
 3321                 par = ncp->nc_parent;
 3322                 _cache_hold(par);
 3323                 _cache_lock(par);
 3324                 while ((par_tmp = par->nc_parent) != NULL &&
 3325                        par_tmp->nc_vp == NULL) {
 3326                         _cache_hold(par_tmp);
 3327                         _cache_lock(par_tmp);
 3328                         _cache_put(par);
 3329                         par = par_tmp;
 3330                 }
 3331                 if (par->nc_parent == NULL) {
 3332                         kprintf("EXDEV case 2 %*.*s\n",
 3333                                 par->nc_nlen, par->nc_nlen, par->nc_name);
 3334                         _cache_put(par);
 3335                         return (EXDEV);
 3336                 }
 3337                 kprintf("[diagnostic] cache_resolve: had to recurse on %*.*s\n",
 3338                         par->nc_nlen, par->nc_nlen, par->nc_name);
 3339                 /*
 3340                  * The parent is not set in stone, ref and lock it to prevent
 3341                  * it from disappearing.  Also note that due to renames it
 3342                  * is possible for our ncp to move and for par to no longer
 3343                  * be one of its parents.  We resolve it anyway, the loop 
 3344                  * will handle any moves.
 3345                  */
 3346                 _cache_get(par);        /* additional hold/lock */
 3347                 _cache_put(par);        /* from earlier hold/lock */
 3348                 if (par == nch->mount->mnt_ncmountpt.ncp) {
 3349                         cache_resolve_mp(nch->mount);
 3350                 } else if ((dvp = cache_dvpref(par)) == NULL) {
 3351                         kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name);
 3352                         _cache_put(par);
 3353                         continue;
 3354                 } else {
 3355                         if (par->nc_flag & NCF_UNRESOLVED) {
 3356                                 nctmp.mount = mp;
 3357                                 nctmp.ncp = par;
 3358                                 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
 3359                         }
 3360                         vrele(dvp);
 3361                 }
 3362                 if ((error = par->nc_error) != 0) {
 3363                         if (par->nc_error != EAGAIN) {
 3364                                 kprintf("EXDEV case 3 %*.*s error %d\n",
 3365                                     par->nc_nlen, par->nc_nlen, par->nc_name,
 3366                                     par->nc_error);
 3367                                 _cache_put(par);
 3368                                 return(error);
 3369                         }
 3370                         kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
 3371                                 par, par->nc_nlen, par->nc_nlen, par->nc_name);
 3372                 }
 3373                 _cache_put(par);
 3374                 /* loop */
 3375         }
 3376 
 3377         /*
 3378          * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
 3379          * ncp's and reattach them.  If this occurs the original ncp is marked
 3380          * EAGAIN to force a relookup.
 3381          *
 3382          * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
 3383          * ncp must already be resolved.
 3384          */
 3385         if (dvp) {
 3386                 nctmp.mount = mp;
 3387                 nctmp.ncp = ncp;
 3388                 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
 3389                 vrele(dvp);
 3390         } else {
 3391                 ncp->nc_error = EPERM;
 3392         }
 3393         if (ncp->nc_error == EAGAIN) {
 3394                 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
 3395                         ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
 3396                 goto restart;
 3397         }
 3398         return(ncp->nc_error);
 3399 }
 3400 
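/*
 * Illustrative sketch (not part of the original source): interpreting
 * cache_resolve() results per the comment above.  A return of 0 is a
 * positive hit (ncp->nc_vp is valid while the entry remains locked),
 * ENOENT is a successfully resolved negative hit, and anything else is a
 * hard error.  The helper name is hypothetical.
 */
static struct vnode *
example_resolve_to_vp(struct nchandle *nch, struct ucred *cred, int *errorp)
{
        *errorp = cache_resolve(nch, cred);
        if (*errorp == 0)
                return (nch->ncp->nc_vp);       /* positive hit */
        return (NULL);                          /* negative hit or error */
}
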
 3401 /*
 3402  * Resolve the ncp associated with a mount point.  Such ncp's almost always
 3403  * remain resolved and this routine is rarely called.  NFS MPs tend to force
 3404  * re-resolution more often due to its mac-truck-smash-the-namecache
 3405  * method of tracking namespace changes.
 3406  *
 3407  * The semantics of this call are that the passed ncp must be locked on
 3408  * entry and will be locked on return.  However, if we actually have to
 3409  * resolve the mount point we temporarily unlock the entry in order to
 3410  * avoid race-to-root deadlocks due to e.g. dead NFS mounts.  Because of
 3411  * the unlock we have to recheck the flags after we relock.
 3412  */
 3413 static int
 3414 cache_resolve_mp(struct mount *mp)
 3415 {
 3416         struct namecache *ncp = mp->mnt_ncmountpt.ncp;
 3417         struct vnode *vp;
 3418         int error;
 3419 
 3420         KKASSERT(mp != NULL);
 3421 
 3422         /*
 3423          * If the ncp is already resolved we have nothing to do.  However,
 3424          * we do want to guarantee that a usable vnode is returned when
 3425          * a vnode is present, so make sure it hasn't been reclaimed.
 3426          */
 3427         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
 3428                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
 3429                         _cache_setunresolved(ncp);
 3430         }
 3431 
 3432         if (ncp->nc_flag & NCF_UNRESOLVED) {
 3433                 _cache_unlock(ncp);
 3434                 while (vfs_busy(mp, 0))
 3435                         ;
 3436                 error = VFS_ROOT(mp, &vp);
 3437                 _cache_lock(ncp);
 3438 
 3439                 /*
 3440                  * recheck the ncp state after relocking.
 3441                  */
 3442                 if (ncp->nc_flag & NCF_UNRESOLVED) {
 3443                         ncp->nc_error = error;
 3444                         if (error == 0) {
 3445                                 _cache_setvp(mp, ncp, vp);
 3446                                 vput(vp);
 3447                         } else {
 3448                                 kprintf("[diagnostic] cache_resolve_mp: failed"
 3449                                         " to resolve mount %p err=%d ncp=%p\n",
 3450                                         mp, error, ncp);
 3451                                 _cache_setvp(mp, ncp, NULL);
 3452                         }
 3453                 } else if (error == 0) {
 3454                         vput(vp);
 3455                 }
 3456                 vfs_unbusy(mp);
 3457         }
 3458         return(ncp->nc_error);
 3459 }
 3460 
 3461 /*
 3462  * Clean out negative cache entries when too many have accumulated.
 3463  */
 3464 static void
 3465 _cache_cleanneg(int count)
 3466 {
 3467         struct namecache *ncp;
 3468 
 3469         /*
 3470          * Attempt to clean out the specified number of negative cache
 3471          * entries.
 3472          */
 3473         while (count) {
 3474                 spin_lock(&ncspin);
 3475                 ncp = TAILQ_FIRST(&ncneglist);
 3476                 if (ncp == NULL) {
 3477                         spin_unlock(&ncspin);
 3478                         break;
 3479                 }
 3480                 TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
 3481                 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
 3482                 _cache_hold(ncp);
 3483                 spin_unlock(&ncspin);
 3484 
 3485                 /*
 3486                  * This can race, so we must re-check that the ncp
 3487                  * is on the ncneglist after successfully locking it.
 3488                  */
 3489                 if (_cache_lock_special(ncp) == 0) {
 3490                         if (ncp->nc_vp == NULL &&
 3491                             (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
 3492                                 ncp = cache_zap(ncp, 1);
 3493                                 if (ncp)
 3494                                         _cache_drop(ncp);
 3495                         } else {
 3496                                 kprintf("cache_cleanneg: race avoided\n");
 3497                                 _cache_unlock(ncp);
 3498                         }
 3499                 } else {
 3500                         _cache_drop(ncp);
 3501                 }
 3502                 --count;
 3503         }
 3504 }
 3505 
 3506 /*
 3507  * Clean out positive cache entries when too many have accumulated.
 3508  */
 3509 static void
 3510 _cache_cleanpos(int count)
 3511 {
 3512         static volatile int rover;
 3513         struct nchash_head *nchpp;
 3514         struct namecache *ncp;
 3515         int rover_copy;
 3516 
 3517         /*
 3518          * Attempt to clean out the specified number of positive cache
 3519          * entries.
 3520          */
 3521         while (count) {
 3522                 rover_copy = ++rover;   /* MPSAFEENOUGH */
 3523                 cpu_ccfence();
 3524                 nchpp = NCHHASH(rover_copy);
 3525 
 3526                 spin_lock_shared(&nchpp->spin);
 3527                 ncp = LIST_FIRST(&nchpp->list);
 3528                 while (ncp && (ncp->nc_flag & NCF_DESTROYED))
 3529                         ncp = LIST_NEXT(ncp, nc_hash);
 3530                 if (ncp)
 3531                         _cache_hold(ncp);
 3532                 spin_unlock_shared(&nchpp->spin);
 3533 
 3534                 if (ncp) {
 3535                         if (_cache_lock_special(ncp) == 0) {
 3536                                 ncp = cache_zap(ncp, 1);
 3537                                 if (ncp)
 3538                                         _cache_drop(ncp);
 3539                         } else {
 3540                                 _cache_drop(ncp);
 3541                         }
 3542                 }
 3543                 --count;
 3544         }
 3545 }
 3546 
 3547 /*
 3548  * This is a kitchen sink function to clean out ncps which we
 3549  * tried to zap from cache_drop() but failed because we were
 3550  * unable to acquire the parent lock.
 3551  *
 3552  * Such entries can also be removed via cache_inval_vp(), such
 3553  * as when unmounting.
 3554  */
 3555 static void
 3556 _cache_cleandefered(void)
 3557 {
 3558         struct nchash_head *nchpp;
 3559         struct namecache *ncp;
 3560         struct namecache dummy;
 3561         int i;
 3562 
 3563         numdefered = 0;
 3564         bzero(&dummy, sizeof(dummy));
 3565         dummy.nc_flag = NCF_DESTROYED;
 3566         dummy.nc_refs = 1;
 3567 
 3568         for (i = 0; i <= nchash; ++i) {
 3569                 nchpp = &nchashtbl[i];
 3570 
 3571                 spin_lock(&nchpp->spin);
 3572                 LIST_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
 3573                 ncp = &dummy;
 3574                 while ((ncp = LIST_NEXT(ncp, nc_hash)) != NULL) {
 3575                         if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
 3576                                 continue;
 3577                         LIST_REMOVE(&dummy, nc_hash);
 3578                         LIST_INSERT_AFTER(ncp, &dummy, nc_hash);
 3579                         _cache_hold(ncp);
 3580                         spin_unlock(&nchpp->spin);
 3581                         if (_cache_lock_nonblock(ncp) == 0) {
 3582                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
 3583                                 _cache_unlock(ncp);
 3584                         }
 3585                         _cache_drop(ncp);
 3586                         spin_lock(&nchpp->spin);
 3587                         ncp = &dummy;
 3588                 }
 3589                 LIST_REMOVE(&dummy, nc_hash);
 3590                 spin_unlock(&nchpp->spin);
 3591         }
 3592 }
 3593 
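/*
 * Illustrative sketch (not part of the original source): the "dummy"
 * placeholder technique used by _cache_cleandefered() above, restated with
 * hypothetical types.  The marker keeps our position in the list so the
 * protecting spinlock can be dropped while an element is processed, and
 * the walk can resume safely after relocking.
 */
struct xitem {
        LIST_ENTRY(xitem) x_link;
        int x_flags;
#define XITEM_MARKER    0x0001
};
LIST_HEAD(xitemlist, xitem);

static void
example_marker_walk(struct xitemlist *head, struct spinlock *spin,
                    void (*process)(struct xitem *))
{
        struct xitem dummy;
        struct xitem *it;

        bzero(&dummy, sizeof(dummy));
        dummy.x_flags = XITEM_MARKER;

        spin_lock(spin);
        LIST_INSERT_HEAD(head, &dummy, x_link);
        it = &dummy;
        while ((it = LIST_NEXT(it, x_link)) != NULL) {
                if (it->x_flags & XITEM_MARKER)
                        continue;
                /*
                 * Move the marker past the element we are about to process
                 * so the iteration can resume from here after relocking.
                 */
                LIST_REMOVE(&dummy, x_link);
                LIST_INSERT_AFTER(it, &dummy, x_link);
                spin_unlock(spin);
                process(it);            /* may block */
                spin_lock(spin);
                it = &dummy;
        }
        LIST_REMOVE(&dummy, x_link);
        spin_unlock(spin);
}
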
 3594 /*
 3595  * Name cache initialization, from vfsinit() when we are booting
 3596  */
 3597 void
 3598 nchinit(void)
 3599 {
 3600         int i;
 3601         globaldata_t gd;
 3602 
 3603         /* initialize per-cpu namecache effectiveness statistics. */
 3604         for (i = 0; i < ncpus; ++i) {
 3605                 gd = globaldata_find(i);
 3606                 gd->gd_nchstats = &nchstats[i];
 3607         }
 3608         TAILQ_INIT(&ncneglist);
 3609         spin_init(&ncspin);
 3610         nchashtbl = hashinit_ext(desiredvnodes / 2,
 3611                                  sizeof(struct nchash_head),
 3612                                  M_VFSCACHE, &nchash);
 3613         for (i = 0; i <= (int)nchash; ++i) {
 3614                 LIST_INIT(&nchashtbl[i].list);
 3615                 spin_init(&nchashtbl[i].spin);
 3616         }
 3617         for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
 3618                 spin_init(&ncmount_cache[i].spin);
 3619         nclockwarn = 5 * hz;
 3620 }
 3621 
 3622 /*
 3623  * Called from start_init() to bootstrap the root filesystem.  Returns
 3624  * a referenced, unlocked namecache record.
 3625  */
 3626 void
 3627 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
 3628 {
 3629         nch->ncp = cache_alloc(0);
 3630         nch->mount = mp;
 3631         atomic_add_int(&mp->mnt_refs, 1);
 3632         if (vp)
 3633                 _cache_setvp(nch->mount, nch->ncp, vp);
 3634 }
 3635 
 3636 /*
 3637  * vfs_cache_setroot()
 3638  *
 3639  *      Create an association between the root of our namecache and
 3640  *      the root vnode.  This routine may be called several times during
 3641  *      booting.
 3642  *
 3643  *      If the caller intends to save the returned namecache pointer somewhere
 3644  *      it must cache_hold() it.
 3645  */
 3646 void
 3647 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
 3648 {
 3649         struct vnode *ovp;
 3650         struct nchandle onch;
 3651 
 3652         ovp = rootvnode;
 3653         onch = rootnch;
 3654         rootvnode = nvp;
 3655         if (nch)
 3656                 rootnch = *nch;
 3657         else
 3658                 cache_zero(&rootnch);
 3659         if (ovp)
 3660                 vrele(ovp);
 3661         if (onch.ncp)
 3662                 cache_drop(&onch);
 3663 }
 3664 
 3665 /*
 3666  * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
 3667  * topology and is being removed as quickly as possible.  The new VOP_N*()
 3668  * API calls are required to make specific adjustments using the supplied
 3669  * ncp pointers rather than just bogusly purging random vnodes.
 3670  *
 3671  * Invalidate all namecache entries to a particular vnode as well as 
 3672  * any direct children of that vnode in the namecache.  This is a 
 3673  * 'catch all' purge used by filesystems that do not know any better.
 3674  *
 3675  * Note that the linkage between the vnode and its namecache entries will
 3676  * be removed, but the namecache entries themselves might stay put due to
 3677  * active references from elsewhere in the system or due to the existence of
 3678  * the children.  The namecache topology is left intact even if we do not
 3679  * know what the vnode association is.  Such entries will be marked
 3680  * NCF_UNRESOLVED.
 3681  */
 3682 void
 3683 cache_purge(struct vnode *vp)
 3684 {
 3685         cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
 3686 }
 3687 
 3688 /*
 3689  * Flush all entries referencing a particular filesystem.
 3690  *
 3691  * Since we need to check it anyway, we will flush all the invalid
 3692  * entries at the same time.
 3693  */
 3694 #if 0
 3695 
 3696 void
 3697 cache_purgevfs(struct mount *mp)
 3698 {
 3699         struct nchash_head *nchpp;
 3700         struct namecache *ncp, *nnp;
 3701 
 3702         /*
 3703          * Scan hash tables for applicable entries.
 3704          */
 3705         for (nchpp = &nchashtbl[nchash]; nchpp >= nchashtbl; nchpp--) {
 3706                 spin_lock_wr(&nchpp->spin); XXX
 3707                 ncp = LIST_FIRST(&nchpp->list);
 3708                 if (ncp)
 3709                         _cache_hold(ncp);
 3710                 while (ncp) {
 3711                         nnp = LIST_NEXT(ncp, nc_hash);
 3712                         if (nnp)
 3713                                 _cache_hold(nnp);
 3714                         if (ncp->nc_mount == mp) {
 3715                                 _cache_lock(ncp);
 3716                                 ncp = cache_zap(ncp, 0);
 3717                                 if (ncp)
 3718                                         _cache_drop(ncp);
 3719                         } else {
 3720                                 _cache_drop(ncp);
 3721                         }
 3722                         ncp = nnp;
 3723                 }
 3724                 spin_unlock_wr(&nchpp->spin); XXX
 3725         }
 3726 }
 3727 
 3728 #endif
 3729 
 3730 static int disablecwd;
 3731 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
 3732     "Disable getcwd");
 3733 
 3734 static u_long numcwdcalls;
 3735 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
 3736     "Number of current directory resolution calls");
 3737 static u_long numcwdfailnf;
 3738 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
 3739     "Number of current directory failures due to lack of file");
 3740 static u_long numcwdfailsz;
 3741 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
 3742     "Number of current directory failures due to large result");
 3743 static u_long numcwdfound;
 3744 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
 3745     "Number of current directory resolution successes");
 3746 
 3747 /*
 3748  * MPALMOSTSAFE
 3749  */
 3750 int
 3751 sys___getcwd(struct __getcwd_args *uap)
 3752 {
 3753         u_int buflen;
 3754         int error;
 3755         char *buf;
 3756         char *bp;
 3757 
 3758         if (disablecwd)
 3759                 return (ENODEV);
 3760 
 3761         buflen = uap->buflen;
 3762         if (buflen == 0)
 3763                 return (EINVAL);
 3764         if (buflen > MAXPATHLEN)
 3765                 buflen = MAXPATHLEN;
 3766 
 3767         buf = kmalloc(buflen, M_TEMP, M_WAITOK);
 3768         bp = kern_getcwd(buf, buflen, &error);
 3769         if (error == 0)
 3770                 error = copyout(bp, uap->buf, strlen(bp) + 1);
 3771         kfree(buf, M_TEMP);
 3772         return (error);
 3773 }
 3774 
 3775 char *
 3776 kern_getcwd(char *buf, size_t buflen, int *error)
 3777 {
 3778         struct proc *p = curproc;
 3779         char *bp;
 3780         int i, slash_prefixed;
 3781         struct filedesc *fdp;
 3782         struct nchandle nch;
 3783         struct namecache *ncp;
 3784 
 3785         numcwdcalls++;
 3786         bp = buf;
 3787         bp += buflen - 1;
 3788         *bp = '\0';
 3789         fdp = p->p_fd;
 3790         slash_prefixed = 0;
 3791 
 3792         nch = fdp->fd_ncdir;
 3793         ncp = nch.ncp;
 3794         if (ncp)
 3795                 _cache_hold(ncp);
 3796 
 3797         while (ncp && (ncp != fdp->fd_nrdir.ncp ||
 3798                nch.mount != fdp->fd_nrdir.mount)
 3799         ) {
 3800                 /*
 3801                  * While traversing upwards if we encounter the root
 3802                  * of the current mount we have to skip to the mount point
 3803                  * in the underlying filesystem.
 3804                  */
 3805                 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
 3806                         nch = nch.mount->mnt_ncmounton;
 3807                         _cache_drop(ncp);
 3808                         ncp = nch.ncp;
 3809                         if (ncp)
 3810                                 _cache_hold(ncp);
 3811                         continue;
 3812                 }
 3813 
 3814                 /*
 3815                  * Prepend the path segment
 3816                  */
 3817                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
 3818                         if (bp == buf) {
 3819                                 numcwdfailsz++;
 3820                                 *error = ERANGE;
 3821                                 bp = NULL;
 3822                                 goto done;
 3823                         }
 3824                         *--bp = ncp->nc_name[i];
 3825                 }
 3826                 if (bp == buf) {
 3827                         numcwdfailsz++;
 3828                         *error = ERANGE;
 3829                         bp = NULL;
 3830                         goto done;
 3831                 }
 3832                 *--bp = '/';
 3833                 slash_prefixed = 1;
 3834 
 3835                 /*
 3836                  * Go up a directory.  This isn't a mount point so we don't
 3837                  * have to check again.
 3838                  */
 3839                 while ((nch.ncp = ncp->nc_parent) != NULL) {
 3840                         if (ncp_shared_lock_disable)
 3841                                 _cache_lock(ncp);
 3842                         else
 3843                                 _cache_lock_shared(ncp);
 3844                         if (nch.ncp != ncp->nc_parent) {
 3845                                 _cache_unlock(ncp);
 3846                                 continue;
 3847                         }
 3848                         _cache_hold(nch.ncp);
 3849                         _cache_unlock(ncp);
 3850                         break;
 3851                 }
 3852                 _cache_drop(ncp);
 3853                 ncp = nch.ncp;
 3854         }
 3855         if (ncp == NULL) {
 3856                 numcwdfailnf++;
 3857                 *error = ENOENT;
 3858                 bp = NULL;
 3859                 goto done;
 3860         }
 3861         if (!slash_prefixed) {
 3862                 if (bp == buf) {
 3863                         numcwdfailsz++;
 3864                         *error = ERANGE;
 3865                         bp = NULL;
 3866                         goto done;
 3867                 }
 3868                 *--bp = '/';
 3869         }
 3870         numcwdfound++;
 3871         *error = 0;
 3872 done:
 3873         if (ncp)
 3874                 _cache_drop(ncp);
 3875         return (bp);
 3876 }
 3877 
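/*
 * Illustrative sketch (not part of the original source): the reverse path
 * construction used by kern_getcwd() above.  Components are prepended in
 * front of a cursor that starts at the end of the buffer, so the finished
 * string is already NUL terminated when the cursor is returned.  The
 * helper name is hypothetical.
 */
static char *
example_prepend_component(char *buf, char *bp, const char *name, int len)
{
        int i;

        for (i = len - 1; i >= 0; i--) {
                if (bp == buf)
                        return (NULL);          /* out of room => ERANGE */
                *--bp = name[i];
        }
        if (bp == buf)
                return (NULL);
        *--bp = '/';
        return (bp);
}
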
 3878 /*
 3879  * Thus begins the fullpath magic.
 3880  *
 3881  * The passed nchp is referenced but not locked.
 3882  */
 3883 static int disablefullpath;
 3884 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
 3885     &disablefullpath, 0,
 3886     "Disable fullpath lookups");
 3887 
 3888 static u_int numfullpathcalls;
 3889 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathcalls, CTLFLAG_RD,
 3890     &numfullpathcalls, 0,
 3891     "Number of full path resolutions in progress");
 3892 static u_int numfullpathfailnf;
 3893 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailnf, CTLFLAG_RD,
 3894     &numfullpathfailnf, 0,
 3895     "Number of full path resolution failures due to lack of file");
 3896 static u_int numfullpathfailsz;
 3897 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailsz, CTLFLAG_RD,
 3898     &numfullpathfailsz, 0,
 3899     "Number of full path resolution failures due to insufficient memory");
 3900 static u_int numfullpathfound;
 3901 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfound, CTLFLAG_RD,
 3902     &numfullpathfound, 0,
 3903     "Number of full path resolution successes");
 3904 
 3905 int
 3906 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
 3907                char **retbuf, char **freebuf, int guess)
 3908 {
 3909         struct nchandle fd_nrdir;
 3910         struct nchandle nch;
 3911         struct namecache *ncp;
 3912         struct mount *mp, *new_mp;
 3913         char *bp, *buf;
 3914         int slash_prefixed;
 3915         int error = 0;
 3916         int i;
 3917 
 3918         atomic_add_int(&numfullpathcalls, -1);
 3919 
 3920         *retbuf = NULL; 
 3921         *freebuf = NULL;
 3922 
 3923         buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 3924         bp = buf + MAXPATHLEN - 1;
 3925         *bp = '\0';
 3926         if (nchbase)
 3927                 fd_nrdir = *nchbase;
 3928         else if (p != NULL)
 3929                 fd_nrdir = p->p_fd->fd_nrdir;
 3930         else
 3931                 fd_nrdir = rootnch;
 3932         slash_prefixed = 0;
 3933         nch = *nchp;
 3934         ncp = nch.ncp;
 3935         if (ncp)
 3936                 _cache_hold(ncp);
 3937         mp = nch.mount;
 3938 
 3939         while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
 3940                 new_mp = NULL;
 3941 
 3942                 /*
 3943                  * If we are asked to guess the upwards path, we do so whenever
 3944                  * we encounter an ncp marked as a mountpoint. We try to find
 3945                  * the actual mountpoint by finding the mountpoint with this
 3946                  * ncp.
 3947                  */
 3948                 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
 3949                         new_mp = mount_get_by_nc(ncp);
 3950                 }
 3951                 /*
 3952                  * While traversing upwards if we encounter the root
 3953                  * of the current mount we have to skip to the mount point.
 3954                  */
 3955                 if (ncp == mp->mnt_ncmountpt.ncp) {
 3956                         new_mp = mp;
 3957                 }
 3958                 if (new_mp) {
 3959                         nch = new_mp->mnt_ncmounton;
 3960                         _cache_drop(ncp);
 3961                         ncp = nch.ncp;
 3962                         if (ncp)
 3963                                 _cache_hold(ncp);
 3964                         mp = nch.mount;
 3965                         continue;
 3966                 }
 3967 
 3968                 /*
 3969                  * Prepend the path segment
 3970                  */
 3971                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
 3972                         if (bp == buf) {
 3973                                 numfullpathfailsz++;
 3974                                 kfree(buf, M_TEMP);
 3975                                 error = ENOMEM;
 3976                                 goto done;
 3977                         }
 3978                         *--bp = ncp->nc_name[i];
 3979                 }
 3980                 if (bp == buf) {
 3981                         numfullpathfailsz++;
 3982                         kfree(buf, M_TEMP);
 3983                         error = ENOMEM;
 3984                         goto done;
 3985                 }
 3986                 *--bp = '/';
 3987                 slash_prefixed = 1;
 3988 
 3989                 /*
 3990                  * Go up a directory.  This isn't a mount point so we don't
 3991                  * have to check again.
 3992                  *
 3993                  * We can only safely access nc_parent with ncp held locked.
 3994                  */
 3995                 while ((nch.ncp = ncp->nc_parent) != NULL) {
 3996                         _cache_lock(ncp);
 3997                         if (nch.ncp != ncp->nc_parent) {
 3998                                 _cache_unlock(ncp);
 3999                                 continue;
 4000                         }
 4001                         _cache_hold(nch.ncp);
 4002                         _cache_unlock(ncp);
 4003                         break;
 4004                 }
 4005                 _cache_drop(ncp);
 4006                 ncp = nch.ncp;
 4007         }
 4008         if (ncp == NULL) {
 4009                 numfullpathfailnf++;
 4010                 kfree(buf, M_TEMP);
 4011                 error = ENOENT;
 4012                 goto done;
 4013         }
 4014 
 4015         if (!slash_prefixed) {
 4016                 if (bp == buf) {
 4017                         numfullpathfailsz++;
 4018                         kfree(buf, M_TEMP);
 4019                         error = ENOMEM;
 4020                         goto done;
 4021                 }
 4022                 *--bp = '/';
 4023         }
 4024         numfullpathfound++;
 4025         *retbuf = bp; 
 4026         *freebuf = buf;
 4027         error = 0;
 4028 done:
 4029         if (ncp)
 4030                 _cache_drop(ncp);
 4031         return(error);
 4032 }
 4033 
 4034 int
 4035 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
 4036             char **freebuf, int guess)
 4037 {
 4038         struct namecache *ncp;
 4039         struct nchandle nch;
 4040         int error;
 4041 
 4042         *freebuf = NULL;
 4043         atomic_add_int(&numfullpathcalls, 1);
 4044         if (disablefullpath)
 4045                 return (ENODEV);
 4046 
 4047         if (p == NULL)
 4048                 return (EINVAL);
 4049 
 4050         /* vn is NULL, client wants us to use p->p_textvp */
 4051         if (vn == NULL) {
 4052                 if ((vn = p->p_textvp) == NULL)
 4053                         return (EINVAL);
 4054         }
 4055         spin_lock_shared(&vn->v_spin);
 4056         TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
 4057                 if (ncp->nc_nlen)
 4058                         break;
 4059         }
 4060         if (ncp == NULL) {
 4061                 spin_unlock_shared(&vn->v_spin);
 4062                 return (EINVAL);
 4063         }
 4064         _cache_hold(ncp);
 4065         spin_unlock_shared(&vn->v_spin);
 4066 
 4067         atomic_add_int(&numfullpathcalls, -1);
 4068         nch.ncp = ncp;
 4069         nch.mount = vn->v_mount;
 4070         error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
 4071         _cache_drop(ncp);
 4072         return (error);
 4073 }
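
/*
 * Illustrative sketch (not part of the original source): a typical
 * vn_fullpath() caller.  On success *retbuf points into *freebuf, and the
 * caller must release *freebuf with kfree(..., M_TEMP) when done.  The
 * helper name is hypothetical.
 */
static void
example_print_vnode_path(struct proc *p, struct vnode *vp)
{
        char *retbuf;
        char *freebuf;

        if (vn_fullpath(p, vp, &retbuf, &freebuf, 0) == 0) {
                kprintf("vnode %p resolves to %s\n", vp, retbuf);
                kfree(freebuf, M_TEMP);
        }
}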
