/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all costs
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 *   When a request arrives:
 *   For all that match key
 *   - if RPC# != OR request_size !=
 *     - not a match with this one
 *   - if NFSv4 and received on same TCP socket OR
 *     received on a TCP connection created before the
 *     entry was cached
 *     - not a match with this one
 *     (V2,3 clients might retry on same TCP socket)
 *   - calculate checksum on first N bytes of NFS XDR
 *   - if checksum !=
 *     - not a match for this one
 *   If any of the remaining ones that match has a
 *     seqid_refcnt > 0
 *     - not a match (go do RPC, using new cache entry)
 *   If one match left
 *     - a hit (reply from cache)
 *   else
 *     - miss (go do RPC, using new cache entry)
 *
 * During processing of NFSv4 request:
 * - set a flag when a non-idempotent Op is processed
 * - when an Op that uses a seqid# (Open,...) is processed
 *   - if same seqid# as referenced entry in cache
 *     - free new cache entry
 *     - reply from referenced cache entry
 *   else if next seqid# in order
 *     - free referenced cache entry
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *   else if first seqid# in sequence
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *
 * At end of RPC processing:
 * - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *   cache entry
 *   - save reply in cache entry
 *   - calculate checksum on first N bytes of NFS XDR
 *     request
 *   - note op and length of XDR request (in bytes)
 *   - timestamp it
 * else
 *   - free new cache entry
 * - Send reply (noting info for socket activity check, below)
 *
 * For cache entries saved above:
 * - if saved since seqid_refcnt was > 0
 *   - free when seqid_refcnt decrements to 0
 *     (when next one in sequence is processed above, or
 *     when Openowner/Lockowner is discarded)
 * else { non-idempotent Op(s) }
 *   - free when
 *     - some further activity observed on same
 *       socket
 *       (I'm not yet sure how I'm going to do
 *       this. Maybe look at the TCP connection
 *       to see if the send_tcp_sequence# is well
 *       past sent reply OR K additional RPCs
 *       replied on same socket OR?)
 *     OR
 *     - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *   - if RPC marked In_progress
 *     - discard request (don't send reply)
 *   else
 *     - reply from cache
 *     - timestamp cache entry
 * else
 *   - add entry to cache, marked In_progress
 *   - do RPC
 *   - when RPC done
 *     - if RPC# non-idempotent
 *       - mark entry Done (not In_progress)
 *       - save reply
 *       - timestamp cache entry
 *     else
 *       - free cache entry
 *   - send reply
 *
 * Later, entries with saved replies are free'd a short time (a few minutes)
 * after the reply is sent (timestamp).
 * Reference for the UDP case: Chet Juszczak, "Improving the Performance and
 * Correctness of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 * pages 53-63, San Diego, February 1989.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 * for TCP. For V3, a reply won't be saved when the flood level is
 * hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 * that case. This level should be set high enough that this almost
 * never happens.
 */
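
/*
 * The block below is an illustrative, user-space sketch of the TCP lookup
 * rules described above, kept out of the kernel build with #if 0.  The
 * "drc_entry" structure, its field names and drc_tcp_match() are
 * hypothetical simplifications of the real struct nfsrvcache handled later
 * in this file, not part of the actual implementation.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

struct drc_entry {
	uint32_t	xid;		/* RPC transaction id */
	uint32_t	procnum;	/* RPC procedure number */
	uint32_t	reqlen;		/* length of the XDR request */
	uint16_t	cksum;		/* checksum of first N request bytes */
	uint64_t	sockref;	/* identifies the TCP connection */
	time_t		conntime;	/* request: when its connection was
					 * created; entry: when it was cached */
	bool		nfsv4;		/* true for NFSv4 */
	int		seqid_refcnt;	/* > 0 while a seqid# Op references it */
};

/*
 * Decide whether a newly arrived request (req) may be answered from a
 * cached entry (ent).  A real hit additionally requires that exactly one
 * cached entry matches.
 */
static bool
drc_tcp_match(const struct drc_entry *req, const struct drc_entry *ent)
{

	if (req->xid != ent->xid || req->procnum != ent->procnum ||
	    req->reqlen != ent->reqlen || req->cksum != ent->cksum)
		return (false);
	/*
	 * For NFSv4, a retry must arrive on a different TCP connection that
	 * was not created before the entry was cached; V2,3 clients might
	 * retry on the same TCP socket.
	 */
	if (req->nfsv4 && (req->sockref == ent->sockref ||
	    req->conntime < ent->conntime))
		return (false);
	/* Entries still referenced by a seqid# Op are never replayed. */
	if (ent->seqid_refcnt > 0)
		return (false);
	return (true);
}
#endif
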
#include <fs/nfs/nfsport.h>

extern struct nfsstatsv1 nfsstatsv1;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;

SYSCTL_DECL(_vfs_nfsd);

static u_int nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");
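
/*
 * Example (derived from the handler above): setting the high water mark,
 * e.g. "sysctl vfs.nfsd.tcphighwater=100000", also raises nfsrc_floodlevel
 * to 120000 (the new value plus 20%) whenever the new value is at or above
 * the current flood level, so the flood level always stays ahead of the
 * high water mark.
 */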

static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The reverse mapping from generic to NFS Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid)	(&nfsrcahash_table[nfsrc_hash(xid)])
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100
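
/*
 * Example of the bucket hash: for xid 0x12345678, (xid >> 24) is 0x12, so
 * the value taken modulo NFSRVCACHE_HASHSIZE is 0x1234568a; folding the
 * high byte in this way mixes the upper bits of the xid into the bucket
 * selection.
 */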

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Return the address family of a cache entry (both IPv4 and IPv6 are
 * handled).
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	nfsstatsv1.srvcache_tcppeak = 0;
	nfsstatsv1.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	newrp = malloc(sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
		if (newrp->rc_xid == rp->rc_xid &&
		    newrp->rc_proc == rp->rc_proc &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				nfsstatsv1.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free(newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	struct mbuf *m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		nfsstatsv1.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			m_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				atomic_add_int(&nfsrc_tcpsavedreplies, 1);
				if (nfsrc_tcpsavedreplies >
				    nfsstatsv1.srvcache_tcppeak)
					nfsstatsv1.srvcache_tcppeak =
					    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
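
	/*
	 * When a TCP sequence number is supplied, record it on the ack hash
	 * list so that nfsrc_trimcache() can free the entry once the client
	 * has acknowledged the reply data up to that sequence number.
	 */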
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			nfsstatsv1.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free(newrp, M_NFSRVCACHE);
		goto out;
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		m_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	free(rp, M_NFSRVCACHE);
	atomic_add_int(&nfsstatsv1.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	nfsstatsv1.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define	HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

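	/*
	 * First pass: for the given socket, mark entries whose reply has
	 * been acknowledged (snd_una has passed the recorded TCP sequence
	 * number) so they can be freed in the scans below; on a final call
	 * for the socket, mark the rest as never to be acked.
	 */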
	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

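	/* Only one thread scans the cache at a time; others simply return. */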
	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			    && rp->rc_refcnt == 0
			    && ((rp->rc_flag & RC_REFCNT) ||
				udp_lasttrim > rp->rc_timestamp ||
				nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
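					/*
					 * For example, with tto == 300 an
					 * entry that expires 75 seconds from
					 * now lands in bucket
					 * 75 * 16 / 300 == 4.
					 */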
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	struct mbuf *m;

	m = m1;
	while (m) {
		len += m->m_len;
		m = m->m_next;
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
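	/*
	 * Note that in_cksum() walks the mbuf chain, so the checksum covers
	 * the first cklen bytes of the request even when they span more
	 * than one mbuf.
	 */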
	*cksum = in_cksum(m1, cklen);
	return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
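	/* Currently a no-op. */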
}