1 /*-
2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD: releng/10.0/sys/fs/nfsserver/nfs_nfsdcache.c 254337 2013-08-14 21:11:26Z rmacklem $");
36
37 /*
38 * Here is the basic algorithm:
39 * First, some design criteria I used:
40 * - I think a false hit is more serious than a false miss
41 * - A false hit for an RPC that has Op(s) that order via seqid# must be
42 * avoided at all cost
43 * - A valid hit will probably happen a long time after the original reply
44 * and the TCP socket that the original request was received on will no
45 * longer be active
46 * (The long time delay implies to me that LRU is not appropriate.)
47 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
48 * in them as well as minimizing the risk of redoing retried non-idempotent
49 * Ops.
50 * Because it is biased towards avoiding false hits, multiple entries with
51 * the same xid are to be expected, especially for the case of the entry
52 * in the cache being related to a seqid# sequenced Op.
53 *
54 * The basic algorithm I'm about to code up:
55 * - Null RPCs bypass the cache and are just done
56 * For TCP
57 * - key on <xid, NFS version> (as noted above, there can be several
58 * entries with the same key)
59 * When a request arrives:
60 * For all that match key
61 * - if RPC# != OR request_size !=
62 * - not a match with this one
63 * - if NFSv4 and received on same TCP socket OR
64 * received on a TCP connection created before the
65 * entry was cached
66 * - not a match with this one
67 * (V2,3 clients might retry on same TCP socket)
68 * - calculate checksum on first N bytes of NFS XDR
69 * - if checksum !=
70 * - not a match for this one
71 * If any of the remaining ones that match has a
72 * seqid_refcnt > 0
73 * - not a match (go do RPC, using new cache entry)
74 * If one match left
75 * - a hit (reply from cache)
76 * else
77 * - miss (go do RPC, using new cache entry)
78 *
79 * During processing of NFSv4 request:
80 * - set a flag when a non-idempotent Op is processed
81 * - when an Op that uses a seqid# (Open,...) is processed
82 * - if same seqid# as referenced entry in cache
83 * - free new cache entry
84 * - reply from referenced cache entry
85 * else if next seqid# in order
86 * - free referenced cache entry
87 * - increment seqid_refcnt on new cache entry
88 * - set pointer from Openowner/Lockowner to
89 * new cache entry (aka reference it)
90 * else if first seqid# in sequence
91 * - increment seqid_refcnt on new cache entry
92 * - set pointer from Openowner/Lockowner to
93 * new cache entry (aka reference it)
94 *
95 * At end of RPC processing:
96 * - if seqid_refcnt > 0 OR flagged non-idempotent on new
97 * cache entry
98 * - save reply in cache entry
99 * - calculate checksum on first N bytes of NFS XDR
100 * request
101 * - note op and length of XDR request (in bytes)
102 * - timestamp it
103 * else
104 * - free new cache entry
105 * - Send reply (noting info for socket activity check, below)
106 *
107 * For cache entries saved above:
108 * - if saved since seqid_refcnt was > 0
109 * - free when seqid_refcnt decrements to 0
110 * (when next one in sequence is processed above, or
111 * when Openowner/Lockowner is discarded)
112 * else { non-idempotent Op(s) }
113 * - free when
114 * - some further activity observed on same
115 * socket
116 * (I'm not yet sure how I'm going to do
117 * this. Maybe look at the TCP connection
118 * to see if the send_tcp_sequence# is well
119 * past sent reply OR K additional RPCs
120 * replied on same socket OR?)
121 * OR
122 * - when very old (hours, days, weeks?)
123 *
124 * For UDP (v2, 3 only), pretty much the old way:
125 * - key on <xid, NFS version, RPC#, Client host ip#>
126 * (at most one entry for each key)
127 *
128 * When a Request arrives:
129 * - if a match with entry via key
130 * - if RPC marked In_progress
131 * - discard request (don't send reply)
132 * else
133 * - reply from cache
134 * - timestamp cache entry
135 * else
136 * - add entry to cache, marked In_progress
137 * - do RPC
138 * - when RPC done
139 * - if RPC# non-idempotent
140 * - mark entry Done (not In_progress)
141 * - save reply
142 * - timestamp cache entry
143 * else
144 * - free cache entry
145 * - send reply
146 *
147 * Later, entries with saved replies are free'd a short time (few minutes)
148 * after reply sent (timestamp).
149 * Reference: Chet Juszczak, "Improving the Performance and Correctness
150 * of an NFS Server", in Proc. Winter 1989 USENIX Conference,
151 * pages 53-63. San Diego, February 1989.
152 * for the UDP case.
153 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
154 * for TCP. For V3, a reply won't be saved when the flood level is
155 * hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
156 * that case. This level should be set high enough that this almost
157 * never happens.
158 */
159 #ifndef APPLEKEXT
160 #include <fs/nfs/nfsport.h>
161
162 extern struct nfsstats newnfsstats;
163 extern struct mtx nfsrc_udpmtx;
164 extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
165 int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
166 #endif /* !APPLEKEXT */
167
168 SYSCTL_DECL(_vfs_nfsd);
169
170 static u_int nfsrc_tcphighwater = 0;
171 static int
172 sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
173 {
174 int error, newhighwater;
175
176 newhighwater = nfsrc_tcphighwater;
177 error = sysctl_handle_int(oidp, &newhighwater, 0, req);
178 if (error != 0 || req->newptr == NULL)
179 return (error);
180 if (newhighwater < 0)
181 return (EINVAL);
182 if (newhighwater >= nfsrc_floodlevel)
183 nfsrc_floodlevel = newhighwater + newhighwater / 5;
184 nfsrc_tcphighwater = newhighwater;
185 return (0);
186 }
187 SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
188 sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
189 "High water mark for TCP cache entries");
190
191 static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
192 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
193 &nfsrc_udphighwater, 0,
194 "High water mark for UDP cache entries");
195 static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
196 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
197 &nfsrc_tcptimeout, 0,
198 "Timeout for TCP entries in the DRC");
199 static u_int nfsrc_tcpnonidempotent = 1;
200 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
201 &nfsrc_tcpnonidempotent, 0,
202 "Enable the DRC for NFS over TCP");
203
204 static int nfsrc_udpcachesize = 0;
205 static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
206 static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];
207
208 /*
209 * and the reverse mapping from generic to Version 2 procedure numbers
210 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,		/* 0: Null */
	NFSV2PROC_GETATTR,	/* 1: Getattr */
	NFSV2PROC_SETATTR,	/* 2: Setattr */
	NFSV2PROC_LOOKUP,	/* 3: Lookup */
	NFSV2PROC_NOOP,		/* 4: Access (no V2 equivalent) */
	NFSV2PROC_READLINK,	/* 5: Readlink */
	NFSV2PROC_READ,		/* 6: Read */
	NFSV2PROC_WRITE,	/* 7: Write */
	NFSV2PROC_CREATE,	/* 8: Create */
	NFSV2PROC_MKDIR,	/* 9: Mkdir */
	NFSV2PROC_SYMLINK,	/* 10: Symlink */
	NFSV2PROC_CREATE,	/* 11: Mknod, mapped to V2 Create */
	NFSV2PROC_REMOVE,	/* 12: Remove */
	NFSV2PROC_RMDIR,	/* 13: Rmdir */
	NFSV2PROC_RENAME,	/* 14: Rename */
	NFSV2PROC_LINK,		/* 15: Link */
	NFSV2PROC_READDIR,	/* 16: Readdir */
	NFSV2PROC_NOOP,		/* 17: Readdirplus (no V2 equivalent) */
	NFSV2PROC_STATFS,	/* 18: Fsstat */
	NFSV2PROC_NOOP,		/* 19: Fsinfo (no V2 equivalent) */
	NFSV2PROC_NOOP,		/* 20: Pathconf (no V2 equivalent) */
	NFSV2PROC_NOOP,		/* 21: Commit (no V2 equivalent) */
};
235
236 #define nfsrc_hash(xid) (((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
237 #define NFSRCUDPHASH(xid) \
238 (&nfsrvudphashtbl[nfsrc_hash(xid)])
239 #define NFSRCHASH(xid) \
240 (&nfsrchash_table[nfsrc_hash(xid)].tbl)
241 #define TRUE 1
242 #define FALSE 0
243 #define NFSRVCACHE_CHECKLEN 100
244
245 /* True iff the rpc reply is an nfs status ONLY! */
/* True iff the rpc reply is an nfs status ONLY! */
/*
 * Indexed by NFSv2 procedure number (the table is consulted as
 * nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]] in nfsrvd_updatecache()).
 * TRUE entries are the V2 procedures whose reply is a bare status word,
 * so the reply can be cached as rc_status instead of an mbuf copy.
 */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,			/* 0: Null */
	FALSE,			/* 1: Getattr */
	FALSE,			/* 2: Setattr */
	FALSE,			/* 3: Root */
	FALSE,			/* 4: Lookup */
	FALSE,			/* 5: Readlink */
	FALSE,			/* 6: Read */
	FALSE,			/* 7: Writecache */
	FALSE,			/* 8: Write */
	FALSE,			/* 9: Create */
	TRUE,			/* 10: Remove */
	TRUE,			/* 11: Rename */
	TRUE,			/* 12: Link */
	TRUE,			/* 13: Symlink */
	FALSE,			/* 14: Mkdir */
	TRUE,			/* 15: Rmdir */
	FALSE,			/* 16: Readdir */
	FALSE,			/* 17: Statfs */
	FALSE,			/* unused */
	FALSE,			/* unused */
	FALSE,			/* unused */
	FALSE,			/* unused */
};
270
271 /*
272 * Will NFS want to work over IPv6 someday?
273 */
274 #define NETFAMILY(rp) \
275 (((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)
276
277 /* local functions */
278 static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
279 static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
280 static void nfsrc_lock(struct nfsrvcache *rp);
281 static void nfsrc_unlock(struct nfsrvcache *rp);
282 static void nfsrc_wanted(struct nfsrvcache *rp);
283 static void nfsrc_freecache(struct nfsrvcache *rp);
284 static void nfsrc_trimcache(u_int64_t, struct socket *);
285 static int nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t,
286 struct socket *);
287 static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
288 static void nfsrc_marksametcpconn(u_int64_t);
289
290 /*
291 * Return the correct mutex for this cache entry.
292 */
293 static __inline struct mtx *
294 nfsrc_cachemutex(struct nfsrvcache *rp)
295 {
296
297 if ((rp->rc_flag & RC_UDP) != 0)
298 return (&nfsrc_udpmtx);
299 return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
300 }
301
302 /*
303 * Initialize the server request cache list
304 */
305 APPLESTATIC void
306 nfsrvd_initcache(void)
307 {
308 int i;
309 static int inited = 0;
310
311 if (inited)
312 return;
313 inited = 1;
314 for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
315 LIST_INIT(&nfsrvudphashtbl[i]);
316 LIST_INIT(&nfsrchash_table[i].tbl);
317 }
318 TAILQ_INIT(&nfsrvudplru);
319 nfsrc_tcpsavedreplies = 0;
320 nfsrc_udpcachesize = 0;
321 newnfsstats.srvcache_tcppeak = 0;
322 newnfsstats.srvcache_size = 0;
323 }
324
325 /*
326 * Get a cache entry for this request. Basically just malloc a new one
327 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
328 * Call nfsrc_trimcache() to clean up the cache before returning.
329 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *newrp;
	int ret;

	/* Null RPCs bypass the cache entirely (see design notes above). */
	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	/* Record the NFS protocol version of this request in the entry. */
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	/* Copy the cache key material out of the request descriptor. */
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	/* nd_nam2 is only non-NULL for datagram (UDP) requests. */
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	/* Opportunistically expire old entries on every lookup. */
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (ret);
}
363
364 /*
365 * For UDP (v2, v3):
366 * - key on <xid, NFS version, RPC#, Client host ip#>
367 * (at most one entry for each key)
368 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    /* Full UDP key: xid, RPC#, NFS version and client address. */
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			/*
			 * Someone else holds the entry; sleep (PDROP releases
			 * the mutex) and rescan from the top, since the entry
			 * may have been freed while we slept.
			 */
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			/* Move the entry to the tail of the LRU list. */
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				/* Retry of an RPC still in progress: drop it. */
				newnfsstats.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				/* Replay the cached status-only reply. */
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				/* Replay a copy of the cached reply mbufs. */
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			/* Hit: the new entry was not needed after all. */
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	/* Miss: insert the new entry, marked In_progress, and do the RPC. */
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
454
455 /*
456 * Update a request cache entry after the rpc has been done
457 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	/* The RPC is done, so move the UDP entry to the tail of the LRU. */
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		newnfsstats.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		/* Discard the newly built reply; replay the cached one. */
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		/* Only NFSv4 seqid#-sequenced entries may be referenced. */
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			/* V2 status-only reply: cache just the status word. */
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				/* Track the peak count of saved TCP replies. */
				atomic_add_int(&nfsrc_tcpsavedreplies, 1);
				if (nfsrc_tcpsavedreplies >
				    newnfsstats.srvcache_tcppeak)
					newnfsstats.srvcache_tcppeak =
					    nfsrc_tcpsavedreplies;
			}
			/*
			 * Drop the mutex around m_copym(), since M_WAITOK
			 * may sleep, then store the copy under the lock.
			 */
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			/*
			 * A referenced entry is unlocked here; otherwise the
			 * locked entry is returned so nfsrvd_sentcache() can
			 * record the TCP sequence number and unlock it.
			 */
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		/* Not worth saving: discard the entry. */
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (retrp);
}
556
557 /*
558 * Invalidate and, if possible, free an in prog cache entry.
559 * Must not sleep.
560 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	/*
	 * NOTE(review): rc_flag is read before the mutex is acquired; this
	 * matches upstream and relies on the caller still owning the
	 * in-progress entry — confirm before reordering.
	 */
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	/* Free now only if nobody references or holds the entry. */
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
575
576 /*
577 * Called after nfsrvd_updatecache() once the reply is sent, to update
578 * the entry for nfsrc_activesocket() and unlock it. The argument is
579 * the pointer returned by nfsrvd_updatecache().
580 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, struct socket *so, int err)
{
	tcp_seq tmp_seq;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	/* The entry must still be locked by nfsrvd_updatecache(). */
	if (!(rp->rc_flag & RC_LOCKED))
		panic("nfsrvd_sentcache not locked");
	if (!err) {
		/* Only TCP over INET/INET6 sockets can reach here. */
		if ((so->so_proto->pr_domain->dom_family != AF_INET &&
		     so->so_proto->pr_domain->dom_family != AF_INET6) ||
		    so->so_proto->pr_protocol != IPPROTO_TCP)
			panic("nfs sent cache");
		/*
		 * Record the TCP send sequence number so that
		 * nfsrc_activesocket() can later tell whether the client
		 * has acknowledged the reply.
		 */
		if (nfsrv_getsockseqnum(so, &tmp_seq)) {
			mtx_lock(mutex);
			rp->rc_tcpseq = tmp_seq;
			rp->rc_flag |= RC_TCPSEQ;
			mtx_unlock(mutex);
		}
	}
	nfsrc_unlock(rp);
}
604
605 /*
606 * Get a cache entry for TCP
607 * - key on <xid, nfs version>
608 * (allow multiple entries for a given key)
609 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	/* Length and checksum of the request are part of the TCP match. */
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		/*
		 * Match criteria (see the algorithm description at the top
		 * of the file): same xid, version, RPC#, request length and
		 * checksum; an In_progress entry only matches a retry on
		 * the same TCP connection.
		 *
		 * NOTE(review): the RC_NFSV4 clause below requires the new
		 * request to be NFSv4, on a different connection, created
		 * no earlier than the cached entry.  The design notes above
		 * suggest V2/V3 retries should also be able to match —
		 * confirm against upstream before changing.
		 */
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		/* A seqid#-referenced entry disqualifies a cache hit. */
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		/*
		 * Wait for a locked entry (PDROP releases the mutex) and
		 * redo the whole scan, since it may change while we sleep.
		 */
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			/* Retry of an RPC still in progress: drop it. */
			newnfsstats.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			/* Replay a copy of the saved reply mbuf chain. */
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		/* Hit: the new entry was not needed after all. */
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
741
742 /*
743 * Lock a cache entry.
744 */
745 static void
746 nfsrc_lock(struct nfsrvcache *rp)
747 {
748 struct mtx *mutex;
749
750 mutex = nfsrc_cachemutex(rp);
751 mtx_assert(mutex, MA_OWNED);
752 while ((rp->rc_flag & RC_LOCKED) != 0) {
753 rp->rc_flag |= RC_WANTED;
754 (void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
755 }
756 rp->rc_flag |= RC_LOCKED;
757 }
758
759 /*
760 * Unlock a cache entry.
761 */
762 static void
763 nfsrc_unlock(struct nfsrvcache *rp)
764 {
765 struct mtx *mutex;
766
767 mutex = nfsrc_cachemutex(rp);
768 mtx_lock(mutex);
769 rp->rc_flag &= ~RC_LOCKED;
770 nfsrc_wanted(rp);
771 mtx_unlock(mutex);
772 }
773
774 /*
775 * Wakeup anyone wanting entry.
776 */
777 static void
778 nfsrc_wanted(struct nfsrvcache *rp)
779 {
780 if (rp->rc_flag & RC_WANTED) {
781 rp->rc_flag &= ~RC_WANTED;
782 wakeup((caddr_t)rp);
783 }
784 }
785
786 /*
787 * Free up the entry.
788 * Must not sleep.
789 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{

	/* Caller holds the entry's mutex; this must not sleep. */
	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	}
	/* Wake any thread still waiting for this entry before freeing it. */
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		/* Only TCP replies count against nfsrc_tcpsavedreplies. */
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&newnfsstats.srvcache_size, -1);
}
808
809 /*
810 * Clean out the cache. Called when nfsserver module is unloaded.
811 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	/* Free every TCP entry, one hash bucket (and bucket mutex) at a time. */
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	/* All UDP buckets share one mutex, so hold it across the sweep. */
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	/* Reset the counters now that the cache is empty. */
	newnfsstats.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}
834
835 /*
836 * The basic rule is to get rid of entries that are expired.
837 */
static void
nfsrc_trimcache(u_int64_t sockref, struct socket *so)
{
	struct nfsrvcache *rp, *nextrp;
	int i, j, k, time_histo[10];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0;

	/* Allow only one thread at a time to trim; others just return. */
	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	/* Trim UDP at most once per second unless well over the high water. */
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			/* Free idle entries that are expired or over limit. */
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	/* Same once-per-second pacing for the TCP buckets. */
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		for (i = 0; i < 10; i++)
			time_histo[i] = 0;
		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			if (i == 0)
				tcp_lasttrim = NFSD_MONOSEC;
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= nfsrc_tcptimeout)
						j = nfsrc_tcptimeout - 1;
					if (j < 0)
						j = 0;
					j = (j * 10 / nfsrc_tcptimeout) % 10;
					time_histo[j]++;
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    nfsrc_activesocket(rp, sockref, so))
						nfsrc_freecache(rp);
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		j = nfsrc_tcphighwater / 5;	/* 20% of it */
		if (j > 0 && (nfsrc_tcpsavedreplies + j) > nfsrc_tcphighwater) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			/* Pick the smallest timeout decile covering ~20%. */
			k = 0;
			for (i = 0; i < 8; i++) {
				k += time_histo[i];
				if (k > j)
					break;
			}
			k = nfsrc_tcptimeout * (i + 1) / 10;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			/* Second pass: free entries expiring before thisstamp. */
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 nfsrc_activesocket(rp, sockref,
						    so)))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}
934
935 /*
936 * Add a seqid# reference to the cache entry.
937 */
938 APPLESTATIC void
939 nfsrvd_refcache(struct nfsrvcache *rp)
940 {
941 struct mtx *mutex;
942
943 mutex = nfsrc_cachemutex(rp);
944 mtx_lock(mutex);
945 if (rp->rc_refcnt < 0)
946 panic("nfs cache refcnt");
947 rp->rc_refcnt++;
948 mtx_unlock(mutex);
949 }
950
951 /*
952 * Dereference a seqid# cache entry.
953 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	/* Free once unreferenced, unless the entry is locked or in use. */
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
968
969 /*
970 * Check to see if the socket is active.
971 * Return 1 if the reply has been received/acknowledged by the client,
972 * 0 otherwise.
973 * XXX - Uses tcp internals.
974 */
975 static int
976 nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t cur_sockref,
977 struct socket *cur_so)
978 {
979 int ret = 0;
980
981 if (!(rp->rc_flag & RC_TCPSEQ))
982 return (ret);
983 /*
984 * If the sockref is the same, it is the same TCP connection.
985 */
986 if (cur_sockref == rp->rc_sockref)
987 ret = nfsrv_checksockseqnum(cur_so, rp->rc_tcpseq);
988 return (ret);
989 }
990
991 /*
992 * Calculate the length of the mbuf list and a checksum on the first up to
993 * NFSRVCACHE_CHECKLEN bytes.
994 */
995 static int
996 nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
997 {
998 int len = 0, cklen;
999 mbuf_t m;
1000
1001 m = m1;
1002 while (m) {
1003 len += mbuf_len(m);
1004 m = mbuf_next(m);
1005 }
1006 cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
1007 *cksum = in_cksum(m1, cklen);
1008 return (len);
1009 }
1010
1011 /*
1012 * Mark a TCP connection that is seeing retries. Should never happen for
1013 * NFSv4.
1014 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
	/*
	 * Intentionally empty: retries on the same TCP connection are
	 * currently only counted via the call sites in nfsrc_gettcp();
	 * no per-connection marking is done yet.
	 */
}
1019
Cache object: fa85e763d6f22ca7d9a3e922f0fac167
|