/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 *   When a request arrives:
 *	For all that match key
 *	- if RPC# != OR request_size !=
 *	  - not a match with this one
 *	- if NFSv4 and received on same TCP socket OR
 *	  received on a TCP connection created before the
 *	  entry was cached
 *	  - not a match with this one
 *	  (V2,3 clients might retry on same TCP socket)
 *	- calculate checksum on first N bytes of NFS XDR
 *	- if checksum !=
 *	  - not a match for this one
 *	If any of the remaining ones that match has a
 *	  seqid_refcnt > 0
 *	  - not a match (go do RPC, using new cache entry)
 *	If one match left
 *	  - a hit (reply from cache)
 *	else
 *	  - miss (go do RPC, using new cache entry)
 *
 *   During processing of NFSv4 request:
 *	- set a flag when a non-idempotent Op is processed
 *	- when an Op that uses a seqid# (Open,...) is processed
 *	  - if same seqid# as referenced entry in cache
 *	    - free new cache entry
 *	    - reply from referenced cache entry
 *	  else if next seqid# in order
 *	    - free referenced cache entry
 *	    - increment seqid_refcnt on new cache entry
 *	    - set pointer from Openowner/Lockowner to
 *	      new cache entry (aka reference it)
 *	  else if first seqid# in sequence
 *	    - increment seqid_refcnt on new cache entry
 *	    - set pointer from Openowner/Lockowner to
 *	      new cache entry (aka reference it)
 *
 *   At end of RPC processing:
 *	- if seqid_refcnt > 0 OR flagged non-idempotent on new
 *	  cache entry
 *	  - save reply in cache entry
 *	  - calculate checksum on first N bytes of NFS XDR
 *	    request
 *	  - note op and length of XDR request (in bytes)
 *	  - timestamp it
 *	else
 *	  - free new cache entry
 *	- Send reply (noting info for socket activity check, below)
 *
 *   For cache entries saved above:
 *	- if saved since seqid_refcnt was > 0
 *	  - free when seqid_refcnt decrements to 0
 *	    (when next one in sequence is processed above, or
 *	    when Openowner/Lockowner is discarded)
 *	else { non-idempotent Op(s) }
 *	  - free when
 *	    - some further activity observed on same
 *	      socket
 *	      (I'm not yet sure how I'm going to do
 *	      this. Maybe look at the TCP connection
 *	      to see if the send_tcp_sequence# is well
 *	      past sent reply OR K additional RPCs
 *	      replied on same socket OR?)
 *	    OR
 *	    - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *	- if RPC marked In_progress
 *	  - discard request (don't send reply)
 *	else
 *	  - reply from cache
 *	  - timestamp cache entry
 * else
 *	- add entry to cache, marked In_progress
 *	- do RPC
 *	- when RPC done
 *	  - if RPC# non-idempotent
 *	    - mark entry Done (not In_progress)
 *	    - save reply
 *	    - timestamp cache entry
 *	  else
 *	    - free cache entry
 *	- send reply
 *
 * Later, entries with saved replies are freed a short time (a few minutes)
 *	after the reply is sent (timestamp).
 * Reference (for the UDP case): Chet Juszczak, "Improving the Performance
 *	and Correctness of an NFS Server", in Proc. Winter 1989 USENIX
 *	Conference, pages 53-63, San Diego, February 1989.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP.  For V3, a reply won't be saved when the flood level is
 *	hit.  For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case.  This level should be set high enough that this almost
 *	never happens.
 */
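/*
 * A rough usage sketch (the real caller is the nfsd service loop, not this
 * file) of how the entry points below are expected to be driven, based on
 * their return values; the variable names here are illustrative only:
 *
 *	switch (nfsrvd_getcache(nd)) {
 *	case RC_DROPIT:	retry of an in-progress request;
 *			drop it without replying; break;
 *	case RC_REPLY:	duplicate; nd->nd_mreq holds the cached reply;
 *			send nd->nd_mreq; break;
 *	case RC_DOIT:	new request; nd->nd_rp points at the entry;
 *			perform the RPC;
 *			rp = nfsrvd_updatecache(nd);
 *			send the reply;
 *			if (rp != NULL)
 *				nfsrvd_sentcache(rp, have_seq, seq);
 *			break;
 *	}
 */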
159 #include <fs/nfs/nfsport.h>
160
161 extern struct nfsstatsv1 nfsstatsv1;
162 extern struct mtx nfsrc_udpmtx;
163 extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
164 extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
165 int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
166
167 SYSCTL_DECL(_vfs_nfsd);
168
169 static u_int nfsrc_tcphighwater = 0;
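/*
 * Sysctl handler for vfs.nfsd.tcphighwater.  Negative values are rejected
 * and, when the new high water mark reaches the current flood level, the
 * flood level is raised to 120% of the new value so that saved replies are
 * not capped below the high water mark.
 */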
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The reverse mapping, from generic procedure numbers to Version 2
 * procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

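/*
 * Hash on the xid: the high-order byte is folded into the low-order bits
 * before taking the modulus, so xids that differ only in their upper bits
 * still spread across the hash buckets.
 */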
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid)	(&nfsrcahash_table[nfsrc_hash(xid)])
#define	TRUE	1
#define	FALSE	0
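/* The "N" from the description above: how many bytes of the XDR request are checksummed. */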
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	nfsstatsv1.srvcache_tcppeak = 0;
	nfsstatsv1.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *	(at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
		if (newrp->rc_xid == rp->rc_xid &&
		    newrp->rc_proc == rp->rc_proc &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				nfsstatsv1.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		nfsstatsv1.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				atomic_add_int(&nfsrc_tcpsavedreplies, 1);
				if (nfsrc_tcpsavedreplies >
				    nfsstatsv1.srvcache_tcppeak)
					nfsstatsv1.srvcache_tcppeak =
					    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *	(allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			nfsstatsv1.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&nfsstatsv1.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	nfsstatsv1.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define	HISTSIZE	16
/*
 * Trim the cache.  The basic rule is to get rid of entries that have
 * expired.  In addition, when called with a non-zero sockref, entries for
 * that socket whose replies have been acknowledged (snd_una at or beyond
 * the recorded TCP sequence number) are marked RC_ACK so they can be freed,
 * and when the number of saved TCP replies nears nfsrc_tcphighwater, a
 * histogram of the pending timeouts is built and entries are trimmed with a
 * shortened timeout until the count drops back below the high water mark.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
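	/* Currently a no-op; nothing is recorded for such connections. */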
}