nfs_nfsdcache.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*-
    2  * SPDX-License-Identifier: BSD-3-Clause
    3  *
    4  * Copyright (c) 1989, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * Rick Macklem at The University of Guelph.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  */
   35 
   36 #include <sys/cdefs.h>
   37 __FBSDID("$FreeBSD: releng/12.0/sys/fs/nfsserver/nfs_nfsdcache.c 328417 2018-01-25 22:25:13Z cem $");
   38 
   39 /*
   40  * Here is the basic algorithm:
   41  * First, some design criteria I used:
   42  * - I think a false hit is more serious than a false miss
   43  * - A false hit for an RPC that has Op(s) that order via seqid# must be
   44  *   avoided at all cost
   45  * - A valid hit will probably happen a long time after the original reply
   46  *   and the TCP socket that the original request was received on will no
   47  *   longer be active
   48  *   (The long time delay implies to me that LRU is not appropriate.)
   49  * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
   50  *   in them as well as minimizing the risk of redoing retried non-idempotent
   51  *   Ops.
   52  * Because it is biased towards avoiding false hits, multiple entries with
   53  * the same xid are to be expected, especially for the case of the entry
   54  * in the cache being related to a seqid# sequenced Op.
   55  * 
   56  * The basic algorithm I'm about to code up:
   57  * - Null RPCs bypass the cache and are just done
   58  * For TCP
   59  *      - key on <xid, NFS version> (as noted above, there can be several
   60  *                                   entries with the same key)
   61  *      When a request arrives:
   62  *              For all that match key
   63  *              - if RPC# != OR request_size !=
   64  *                      - not a match with this one
   65  *              - if NFSv4 and received on same TCP socket OR
   66  *                      received on a TCP connection created before the
   67  *                      entry was cached
   68  *                      - not a match with this one
   69  *                      (V2,3 clients might retry on same TCP socket)
   70  *              - calculate checksum on first N bytes of NFS XDR
   71  *              - if checksum !=
   72  *                      - not a match for this one
   73  *              If any of the remaining ones that match has a
   74  *                      seqid_refcnt > 0
   75  *                      - not a match (go do RPC, using new cache entry)
   76  *              If one match left
   77  *                      - a hit (reply from cache)
   78  *              else
   79  *                      - miss (go do RPC, using new cache entry)
   80  * 
   81  *      During processing of NFSv4 request:
   82  *              - set a flag when a non-idempotent Op is processed
   83  *              - when an Op that uses a seqid# (Open,...) is processed
   84  *                      - if same seqid# as referenced entry in cache
   85  *                              - free new cache entry
   86  *                              - reply from referenced cache entry
   87  *                        else if next seqid# in order
   88  *                              - free referenced cache entry
   89  *                              - increment seqid_refcnt on new cache entry
   90  *                              - set pointer from Openowner/Lockowner to
   91  *                                      new cache entry (aka reference it)
   92  *                        else if first seqid# in sequence
   93  *                              - increment seqid_refcnt on new cache entry
   94  *                              - set pointer from Openowner/Lockowner to
   95  *                                      new cache entry (aka reference it)
   96  * 
   97  *      At end of RPC processing:
   98  *              - if seqid_refcnt > 0 OR flagged non-idempotent on new
   99  *                      cache entry
  100  *                      - save reply in cache entry
  101  *                      - calculate checksum on first N bytes of NFS XDR
  102  *                              request
  103  *                      - note op and length of XDR request (in bytes)
  104  *                      - timestamp it
  105  *                else
  106  *                      - free new cache entry
  107  *              - Send reply (noting info for socket activity check, below)
  108  * 
  109  *      For cache entries saved above:
  110  *              - if saved since seqid_refcnt was > 0
  111  *                      - free when seqid_refcnt decrements to 0
  112  *                        (when next one in sequence is processed above, or
  113  *                         when Openowner/Lockowner is discarded)
  114  *                else { non-idempotent Op(s) }
  115  *                      - free when
  116  *                              - some further activity observed on same
  117  *                                      socket
  118  *                                (I'm not yet sure how I'm going to do
  119  *                                 this. Maybe look at the TCP connection
  120  *                                 to see if the send_tcp_sequence# is well
  121  *                                 past sent reply OR K additional RPCs
  122  *                                 replied on same socket OR?)
  123  *                        OR
  124  *                              - when very old (hours, days, weeks?)
  125  * 
  126  * For UDP (v2, 3 only), pretty much the old way:
  127  * - key on <xid, NFS version, RPC#, Client host ip#>
  128  *   (at most one entry for each key)
  129  * 
  130  * When a Request arrives:
  131  * - if a match with entry via key
  132  *      - if RPC marked In_progress
  133  *              - discard request (don't send reply)
  134  *        else
  135  *              - reply from cache
  136  *              - timestamp cache entry
  137  *   else
  138  *      - add entry to cache, marked In_progress
  139  *      - do RPC
  140  *      - when RPC done
  141  *              - if RPC# non-idempotent
  142  *                      - mark entry Done (not In_progress)
  143  *                      - save reply
  144  *                      - timestamp cache entry
  145  *                else
  146  *                      - free cache entry
  147  *              - send reply
  148  * 
  149  * Later, entries with saved replies are free'd a short time (few minutes)
  150  * after reply sent (timestamp).
  151  * Reference: Chet Juszczak, "Improving the Performance and Correctness
  152  *              of an NFS Server", in Proc. Winter 1989 USENIX Conference,
  153  *              pages 53-63. San Diego, February 1989.
  154  *       for the UDP case.
  155  * nfsrc_floodlevel is set to the allowable upper limit for saved replies
  156  *      for TCP. For V3, a reply won't be saved when the flood level is
  157  *      hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
  158  *      that case. This level should be set high enough that this almost
  159  *      never happens.
  160  */
  161 #ifndef APPLEKEXT
  162 #include <fs/nfs/nfsport.h>
  163 
  164 extern struct nfsstatsv1 nfsstatsv1;
  165 extern struct mtx nfsrc_udpmtx;
  166 extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
  167 extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
  168 int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
  169 #endif  /* !APPLEKEXT */
  170 
  171 SYSCTL_DECL(_vfs_nfsd);
  172 
  173 static u_int    nfsrc_tcphighwater = 0;
      /*
       * Sysctl handler for the "tcphighwater" knob registered below.
       *
       * A read reports the current nfsrc_tcphighwater.  A write installs the
       * new value and, when that value reaches or exceeds nfsrc_floodlevel,
       * raises the flood level to the new high water mark plus one fifth.
       *
       * The value is handled as a signed int, so negative values are rejected
       * with EINVAL.  Values above 5/6 of the largest int are rejected as
       * well: for those, newhighwater + newhighwater / 5 could overflow,
       * which is undefined behaviour and in practice would leave
       * nfsrc_floodlevel negative.  (int)(~0U >> 1) spells the largest int
       * without depending on a limits header.
       *
       * Returns 0 on success, the error from sysctl_handle_int(), or EINVAL.
       */
  174 static int
  175 sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
  176 {
  177         int error, newhighwater;
  178
  179         newhighwater = nfsrc_tcphighwater;
  180         error = sysctl_handle_int(oidp, &newhighwater, 0, req);
  181         if (error != 0 || req->newptr == NULL)
  182                 return (error);
  183         if (newhighwater < 0 || newhighwater > (int)(~0U >> 1) / 6 * 5)
  184                 return (EINVAL);
  185         if (newhighwater >= nfsrc_floodlevel)
  186                 nfsrc_floodlevel = newhighwater + newhighwater / 5;
  187         nfsrc_tcphighwater = newhighwater;
  188         return (0);
  189 }
  190 SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
  191     sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
  192     "High water mark for TCP cache entries");
  193 
  194 static u_int    nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
  195 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
  196     &nfsrc_udphighwater, 0,
  197     "High water mark for UDP cache entries");
      /*
       * Added to NFSD_MONOSEC to form rc_timestamp whenever a TCP entry's
       * saved reply is stored or reused.
       */
  198 static u_int    nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
  199 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
  200     &nfsrc_tcptimeout, 0,
  201     "Timeout for TCP entries in the DRC");
      /*
       * When zero, nfsrvd_updatecache() does not save ND_SAVEREPLY replies for
       * TCP (entries with rc_refcnt > 0 are still saved).
       */
  202 static u_int nfsrc_tcpnonidempotent = 1;
  203 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
  204     &nfsrc_tcpnonidempotent, 0,
  205     "Enable the DRC for NFS over TCP");
  206
      /* Incremented by nfsrc_getudp() each time a new UDP entry is inserted. */
  207 static int nfsrc_udpcachesize = 0;
      /*
       * List of UDP entries; an entry is moved to the tail when it is hit in
       * nfsrc_getudp() and when it is updated in nfsrvd_updatecache().
       */
  208 static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
      /* UDP hash chains, selected through NFSRCUDPHASH(xid). */
  209 static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];
  210 
  211 /*
  212  * Reverse mapping from generic procedure numbers to Version 2 procedure
       * numbers.  The table has NFS_V3NPROCS entries; nfsrvd_updatecache()
       * indexes it with nd_procnum and feeds the result to nfsv2_repstat[].
       * NOTE(review): entries of NFSV2PROC_NOOP presumably mark generic
       * procedures with no Version 2 equivalent - confirm against the
       * procedure number definitions.
  213  */
  214 static int newnfsv2_procid[NFS_V3NPROCS] = {
  215         NFSV2PROC_NULL,
  216         NFSV2PROC_GETATTR,
  217         NFSV2PROC_SETATTR,
  218         NFSV2PROC_LOOKUP,
  219         NFSV2PROC_NOOP,
  220         NFSV2PROC_READLINK,
  221         NFSV2PROC_READ,
  222         NFSV2PROC_WRITE,
  223         NFSV2PROC_CREATE,
  224         NFSV2PROC_MKDIR,
  225         NFSV2PROC_SYMLINK,
  226         NFSV2PROC_CREATE,
  227         NFSV2PROC_REMOVE,
  228         NFSV2PROC_RMDIR,
  229         NFSV2PROC_RENAME,
  230         NFSV2PROC_LINK,
  231         NFSV2PROC_READDIR,
  232         NFSV2PROC_NOOP,
  233         NFSV2PROC_STATFS,
  234         NFSV2PROC_NOOP,
  235         NFSV2PROC_NOOP,
  236         NFSV2PROC_NOOP,
  237 };
  238
      /*
       * nfsrc_hash() folds the value shifted right by 24 bits back into the
       * value and reduces the sum modulo NFSRVCACHE_HASHSIZE to get a bucket
       * index.
       */
  239 #define nfsrc_hash(xid) (((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
      /*
       * Bucket lookups built on nfsrc_hash():
       * NFSRCUDPHASH - head of the UDP hash chain for an xid.
       * NFSRCHASH    - head of the TCP hash chain (the bucket's tbl) for an xid.
       * NFSRCAHASH   - the whole nfsrcahash_table bucket; nfsrvd_sentcache()
       *                passes rc_sockref rather than an xid.
       */
  240 #define NFSRCUDPHASH(xid) \
  241         (&nfsrvudphashtbl[nfsrc_hash(xid)])
  242 #define NFSRCHASH(xid) \
  243         (&nfsrchash_table[nfsrc_hash(xid)].tbl)
  244 #define NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)])
  245 #define TRUE    1
  246 #define FALSE   0
      /*
       * NOTE(review): presumably the "first N bytes of NFS XDR" that get
       * checksummed, per the algorithm description at the top of the file;
       * the macro is not referenced in this part of the file - confirm.
       */
  247 #define NFSRVCACHE_CHECKLEN     100
  248 
  249 /* True iff the rpc reply is an nfs status ONLY! */
      /*
       * Indexed by Version 2 procedure number: nfsrvd_updatecache() looks up
       * nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]] for ND_NFSV2 requests.
       * When the entry is TRUE only nd_repstat is cached (rc_status with
       * RC_REPSTATUS) instead of a copy of the reply mbufs.
       */
  250 static int nfsv2_repstat[NFS_V3NPROCS] = {
  251         FALSE,
  252         FALSE,
  253         FALSE,
  254         FALSE,
  255         FALSE,
  256         FALSE,
  257         FALSE,
  258         FALSE,
  259         FALSE,
  260         FALSE,
  261         TRUE,
  262         TRUE,
  263         TRUE,
  264         TRUE,
  265         FALSE,
  266         TRUE,
  267         FALSE,
  268         FALSE,
  269         FALSE,
  270         FALSE,
  271         FALSE,
  272         FALSE,
  273 };
  274 
  275 /*
  276  * Will NFS want to work over IPv6 someday?
       *
       * NETFAMILY() yields the address family recorded for a cache entry:
       * AF_INET6 when RC_INETIPV6 is set in rc_flag (nfsrc_getudp() sets it
       * when it stores an IPv6 client address), AF_INET otherwise.
  277  */
  278 #define NETFAMILY(rp) \
  279                 (((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)
  280 
  281 /* local functions */
      /* Per-transport lookup of a new request, called by nfsrvd_getcache(). */
  282 static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
  283 static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
      /* RC_LOCKED / RC_WANTED sleep lock on a single cache entry. */
  284 static void nfsrc_lock(struct nfsrvcache *rp);
  285 static void nfsrc_unlock(struct nfsrvcache *rp);
  286 static void nfsrc_wanted(struct nfsrvcache *rp);
  287 static void nfsrc_freecache(struct nfsrvcache *rp);
      /* Supplies rc_reqlen (return value) and rc_cksum for nfsrc_gettcp(). */
  288 static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
  289 static void nfsrc_marksametcpconn(u_int64_t);
  290 
  291 /*
  292  * Select the mutex that protects a cache entry: the single UDP mutex
  293  * for RC_UDP entries, else the mutex of the TCP bucket for rc_xid.
  294  */
  295 static __inline struct mtx *
  296 nfsrc_cachemutex(struct nfsrvcache *rp)
  297 {
  298
  299         return ((rp->rc_flag & RC_UDP) != 0 ? &nfsrc_udpmtx :
  300             &nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
  301 }
  302 
  303 /*
  304  * One-time setup of the request cache lists and counters; later calls return.
  305  */
  306 APPLESTATIC void
  307 nfsrvd_initcache(void)
  308 {
  309         static int initdone = 0;
  310         int bucket;
  311
  312         if (initdone != 0)
  313                 return;
  314         initdone = 1;
  315         nfsstatsv1.srvcache_size = 0;
  316         nfsstatsv1.srvcache_tcppeak = 0;
  317         nfsrc_udpcachesize = 0;
  318         nfsrc_tcpsavedreplies = 0;
  319         TAILQ_INIT(&nfsrvudplru);
  320         for (bucket = 0; bucket < NFSRVCACHE_HASHSIZE; bucket++) {
  321                 LIST_INIT(&nfsrcahash_table[bucket].tbl);
  322                 LIST_INIT(&nfsrchash_table[bucket].tbl);
  323                 LIST_INIT(&nfsrvudphashtbl[bucket]);
  324         }
  325 }
  326 
  327 /*
  328  * Build a zeroed cache entry describing the request in nd, then let
  329  * nfsrc_getudp() or nfsrc_gettcp() look it up.  Their result is returned.
  330  */
  331 APPLESTATIC int
  332 nfsrvd_getcache(struct nfsrv_descript *nd)
  333 {
  334         struct nfsrvcache *entry;
  335         int action;
  336
  337         if (nd->nd_procnum == NFSPROC_NULL)
  338                 panic("nfsd cache null");
  339         entry = malloc(sizeof(*entry), M_NFSRVCACHE, M_WAITOK);
  340         NFSBZERO((caddr_t)entry, sizeof(*entry));
  341         if ((nd->nd_flag & ND_NFSV4) != 0)
  342                 entry->rc_flag = RC_NFSV4;
  343         else if ((nd->nd_flag & ND_NFSV3) != 0)
  344                 entry->rc_flag = RC_NFSV3;
  345         else
  346                 entry->rc_flag = RC_NFSV2;
  347         if ((nd->nd_flag & ND_SAMETCPCONN) != 0)
  348                 entry->rc_flag |= RC_SAMETCPCONN;
  349         entry->rc_cachetime = nd->nd_tcpconntime;
  350         entry->rc_sockref = nd->nd_sockref;
  351         entry->rc_proc = nd->nd_procnum;
  352         entry->rc_xid = nd->nd_retxid;
  353         /* A non-NULL nd_nam2 selects the UDP cache, otherwise TCP. */
  354         if (nd->nd_nam2 == NULL) {
  355                 action = nfsrc_gettcp(nd, entry);
  356         } else {
  357                 entry->rc_flag |= RC_UDP;
  358                 action = nfsrc_getudp(nd, entry);
  359         }
  360         NFSEXITCODE2(0, nd);
  361         return (action);
  362 }
  363 
  364 /*
  365  * For UDP (v2, v3):
  366  * - key on <xid, NFS version, RPC#, Client host ip#>
  367  *   (at most one entry for each key)
       *
       * Searches the UDP hash chain for newrp's key while holding the mutex
       * returned by nfsrc_cachemutex().
       * Hit:  the entry is marked RC_LOCKED and moved to the tail of the LRU,
       *       newrp is freed, and the result is RC_DROPIT when the original
       *       RPC is still RC_INPROG, otherwise RC_REPLY with either the
       *       saved status (RC_REPSTATUS) or a copy of the saved reply mbufs
       *       (RC_REPMBUF) placed in nd.
       * Miss: newrp is marked RC_INPROG, the client address is recorded in
       *       it, it is linked into the hash chain and the LRU and stored in
       *       nd->nd_rp; the result is RC_DOIT.
  368  */
  369 static int
  370 nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
  371 {
  372         struct nfsrvcache *rp;
  373         struct sockaddr_in *saddr;
  374         struct sockaddr_in6 *saddr6;
  375         struct nfsrvhashhead *hp;
  376         int ret = 0;
  377         struct mtx *mutex;
  378
  379         mutex = nfsrc_cachemutex(newrp);
  380         hp = NFSRCUDPHASH(newrp->rc_xid);
              /*
               * The search restarts here after sleeping on a locked entry.  The
               * sleep below uses PDROP, so the mutex is taken again on re-entry.
               */
  381 loop:
  382         mtx_lock(mutex);
  383         LIST_FOREACH(rp, hp, rc_hash) {
  384             if (newrp->rc_xid == rp->rc_xid &&
  385                 newrp->rc_proc == rp->rc_proc &&
  386                 (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
  387                 nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
                              /*
                               * Someone else holds the entry: ask to be woken,
                               * sleep (bounded by 10 * hz) and search again.
                               */
  388                         if ((rp->rc_flag & RC_LOCKED) != 0) {
  389                                 rp->rc_flag |= RC_WANTED;
  390                                 (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
  391                                     "nfsrc", 10 * hz);
  392                                 goto loop;
  393                         }
  394                         if (rp->rc_flag == 0)
  395                                 panic("nfs udp cache0");
  396                         rp->rc_flag |= RC_LOCKED;
                              /* Move the entry to the tail of the LRU list. */
  397                         TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
  398                         TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
                              /*
                               * Each branch drops the mutex before doing its
                               * work; the entry stays RC_LOCKED until the
                               * nfsrc_unlock() call after the branches.
                               */
  399                         if (rp->rc_flag & RC_INPROG) {
  400                                 nfsstatsv1.srvcache_inproghits++;
  401                                 mtx_unlock(mutex);
  402                                 ret = RC_DROPIT;
  403                         } else if (rp->rc_flag & RC_REPSTATUS) {
  404                                 /*
  405                                  * V2 only.
  406                                  */
  407                                 nfsstatsv1.srvcache_nonidemdonehits++;
  408                                 mtx_unlock(mutex);
  409                                 nfsrvd_rephead(nd);
  410                                 *(nd->nd_errp) = rp->rc_status;
  411                                 ret = RC_REPLY;
  412                                 rp->rc_timestamp = NFSD_MONOSEC +
  413                                         NFSRVCACHE_UDPTIMEOUT;
  414                         } else if (rp->rc_flag & RC_REPMBUF) {
  415                                 nfsstatsv1.srvcache_nonidemdonehits++;
  416                                 mtx_unlock(mutex);
  417                                 nd->nd_mreq = m_copym(rp->rc_reply, 0,
  418                                         M_COPYALL, M_WAITOK);
  419                                 ret = RC_REPLY;
  420                                 rp->rc_timestamp = NFSD_MONOSEC +
  421                                         NFSRVCACHE_UDPTIMEOUT;
  422                         } else {
  423                                 panic("nfs udp cache1");
  424                         }
                              /* Clear RC_LOCKED; newrp is no longer needed. */
  425                         nfsrc_unlock(rp);
  426                         free(newrp, M_NFSRVCACHE);
  427                         goto out;
  428                 }
  429         }
              /* Miss: the mutex is still held from the search above. */
  430         nfsstatsv1.srvcache_misses++;
  431         atomic_add_int(&nfsstatsv1.srvcache_size, 1);
  432         nfsrc_udpcachesize++;
  433
  434         newrp->rc_flag |= RC_INPROG;
              /*
               * Record the client address.  For a family other than AF_INET or
               * AF_INET6 nothing is stored.
               */
  435         saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
  436         if (saddr->sin_family == AF_INET)
  437                 newrp->rc_inet = saddr->sin_addr.s_addr;
  438         else if (saddr->sin_family == AF_INET6) {
  439                 saddr6 = (struct sockaddr_in6 *)saddr;
  440                 NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
  441                     sizeof (struct in6_addr));
  442                 newrp->rc_flag |= RC_INETIPV6;
  443         }
  444         LIST_INSERT_HEAD(hp, newrp, rc_hash);
  445         TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
  446         mtx_unlock(mutex);
  447         nd->nd_rp = newrp;
  448         ret = RC_DOIT;
  449
  450 out:
  451         NFSEXITCODE2(0, nd);
  452         return (ret);
  453 }
  454 
  455 /*
  456  * Update a request cache entry after the rpc has been done
       *
       * Takes the entry from nd->nd_rp (clearing that pointer), locks it and
       * clears RC_INPROG.  Then one of:
       * - nd_repstat is NFSERR_REPLYFROMCACHE: the reply in nd is replaced by
       *   a copy of the reply already saved in the entry.
       * - the reply is to be saved (see the condition further down): the
       *   status (RC_REPSTATUS) or a copy of the reply mbufs (RC_REPMBUF) is
       *   stored in the entry and its timestamp is set.
       * - otherwise the entry is freed with nfsrc_freecache().
       *
       * Returns the entry, still RC_LOCKED, only when a TCP reply was saved
       * and rc_refcnt is 0; NULL in every other case.
  457  */
  458 APPLESTATIC struct nfsrvcache *
  459 nfsrvd_updatecache(struct nfsrv_descript *nd)
  460 {
  461         struct nfsrvcache *rp;
  462         struct nfsrvcache *retrp = NULL;
  463         mbuf_t m;
  464         struct mtx *mutex;
  465
  466         rp = nd->nd_rp;
  467         if (!rp)
  468                 panic("nfsrvd_updatecache null rp");
  469         nd->nd_rp = NULL;
  470         mutex = nfsrc_cachemutex(rp);
  471         mtx_lock(mutex);
  472         nfsrc_lock(rp);
  473         if (!(rp->rc_flag & RC_INPROG))
  474                 panic("nfsrvd_updatecache not inprog");
  475         rp->rc_flag &= ~RC_INPROG;
              /* UDP entries move to the tail of the LRU list. */
  476         if (rp->rc_flag & RC_UDP) {
  477                 TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
  478                 TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
  479         }
  480
  481         /*
  482          * Reply from cache is a special case returned by nfsrv_checkseqid().
  483          */
  484         if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
  485                 nfsstatsv1.srvcache_nonidemdonehits++;
  486                 mtx_unlock(mutex);
  487                 nd->nd_repstat = 0;
                      /* Discard the reply built for this request, if any. */
  488                 if (nd->nd_mreq)
  489                         mbuf_freem(nd->nd_mreq);
  490                 if (!(rp->rc_flag & RC_REPMBUF))
  491                         panic("reply from cache");
  492                 nd->nd_mreq = m_copym(rp->rc_reply, 0,
  493                     M_COPYALL, M_WAITOK);
  494                 rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
  495                 nfsrc_unlock(rp);
  496                 goto out;
  497         }
  498
  499         /*
  500          * If rc_refcnt > 0, save it
  501          * For UDP, save it if ND_SAVEREPLY is set
  502          * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
               * and nfsrc_tcpsavedreplies has not gone above nfsrc_floodlevel.
               * Nothing is saved when nd_repstat is NFSERR_DONTREPLY.
  503          */
  504         if (nd->nd_repstat != NFSERR_DONTREPLY &&
  505             (rp->rc_refcnt > 0 ||
  506              ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
  507              ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
  508               nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
  509               nfsrc_tcpnonidempotent))) {
  510                 if (rp->rc_refcnt > 0) {
  511                         if (!(rp->rc_flag & RC_NFSV4))
  512                                 panic("update_cache refcnt");
  513                         rp->rc_flag |= RC_REFCNT;
  514                 }
                      /* V2 status-only replies: cache just the status value. */
  515                 if ((nd->nd_flag & ND_NFSV2) &&
  516                     nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
  517                         rp->rc_status = nd->nd_repstat;
  518                         rp->rc_flag |= RC_REPSTATUS;
  519                         mtx_unlock(mutex);
  520                 } else {
                              /* Count saved TCP replies and track the peak. */
  521                         if (!(rp->rc_flag & RC_UDP)) {
  522                             atomic_add_int(&nfsrc_tcpsavedreplies, 1);
  523                             if (nfsrc_tcpsavedreplies >
  524                                 nfsstatsv1.srvcache_tcppeak)
  525                                 nfsstatsv1.srvcache_tcppeak =
  526                                     nfsrc_tcpsavedreplies;
  527                         }
                              /*
                               * The mutex is released around the M_WAITOK copy
                               * (presumably because the copy may sleep) and
                               * taken again to publish the saved reply.
                               */
  528                         mtx_unlock(mutex);
  529                         m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
  530                         mtx_lock(mutex);
  531                         rp->rc_reply = m;
  532                         rp->rc_flag |= RC_REPMBUF;
  533                         mtx_unlock(mutex);
  534                 }
                      /*
                       * The mutex is no longer held here; the entry is still
                       * RC_LOCKED by this thread.
                       */
  535                 if (rp->rc_flag & RC_UDP) {
  536                         rp->rc_timestamp = NFSD_MONOSEC +
  537                             NFSRVCACHE_UDPTIMEOUT;
  538                         nfsrc_unlock(rp);
  539                 } else {
  540                         rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                              /*
                               * With no references the entry is returned
                               * without being unlocked.
                               */
  541                         if (rp->rc_refcnt > 0)
  542                                 nfsrc_unlock(rp);
  543                         else
  544                                 retrp = rp;
  545                 }
  546         } else {
  547                 nfsrc_freecache(rp);
  548                 mtx_unlock(mutex);
  549         }
  550
  551 out:
  552         NFSEXITCODE2(0, nd);
  553         return (retrp);
  554 }
  555 
  556 /*
  557  * Abandon an in-progress entry: clear RC_INPROG and free the entry
  558  * unless it is still referenced or locked.  Must not sleep.
  559  */
  560 APPLESTATIC void
  561 nfsrvd_delcache(struct nfsrvcache *rp)
  562 {
  563         struct mtx *mtxp = nfsrc_cachemutex(rp);
  564
  565         if ((rp->rc_flag & RC_INPROG) == 0)
  566                 panic("nfsrvd_delcache not in prog");
  567         mtx_lock(mtxp);
  568         rp->rc_flag &= ~RC_INPROG;
  569         /* Freeing is only possible with no references and no lock holder. */
  570         if ((rp->rc_flag & RC_LOCKED) == 0 && rp->rc_refcnt == 0)
  571                 nfsrc_freecache(rp);
  572         mtx_unlock(mtxp);
  573 }
  574 
  575 /*
  576  * Final step for an entry returned by nfsrvd_updatecache(), called once
  577  * the reply has been sent: optionally record the TCP sequence number,
  578  * then release the entry's lock.
  579  */
  580 APPLESTATIC void
  581 nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
  582 {
  583         struct nfsrchash_bucket *bucket;
  584
  585         KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
  586         if (have_seq != 0) {
  587                 bucket = NFSRCAHASH(rp->rc_sockref);
  588                 mtx_lock(&bucket->mtx);
  589                 if (rp->rc_acked != RC_NO_ACK)
  590                         LIST_INSERT_HEAD(&bucket->tbl, rp, rc_ahash);
  591                 rp->rc_acked = RC_NO_ACK;
  592                 rp->rc_tcpseq = seq;
  593                 mtx_unlock(&bucket->mtx);
  594         }
  595         nfsrc_unlock(rp);
  596 }
  597 
  598 /*
  599  * Get a cache entry for TCP
  600  * - key on <xid, nfs version>
  601  *   (allow multiple entries for a given key)
       *
       * Collects every entry in the xid's hash chain that matches newrp onto a
       * temporary list, decides from that list whether there is a hit, and
       * puts the entries back on the hash chain.
       * Hit (exactly one candidate, and it has rc_refcnt == 0): the entry is
       *       marked RC_LOCKED, newrp is freed, and the result is RC_DROPIT
       *       when the original RPC is still RC_INPROG, otherwise RC_REPLY
       *       with the saved status or a copy of the saved reply in nd.
       * Miss: newrp is stamped with the current time, marked RC_INPROG,
       *       linked into the hash chain and stored in nd->nd_rp; the result
       *       is RC_DOIT.
  602  */
  603 static int
  604 nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
  605 {
  606         struct nfsrvcache *rp, *nextrp;
  607         int i;
  608         struct nfsrvcache *hitrp;
  609         struct nfsrvhashhead *hp, nfsrc_templist;
  610         int hit, ret = 0;
  611         struct mtx *mutex;
  612
  613         mutex = nfsrc_cachemutex(newrp);
  614         hp = NFSRCHASH(newrp->rc_xid);
              /* Request length and checksum are computed once, before locking. */
  615         newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
              /*
               * The lookup restarts here after sleeping on a locked entry.  The
               * sleep below uses PDROP, so the mutex is taken again on re-entry.
               */
  616 tryagain:
  617         mtx_lock(mutex);
  618         hit = 1;
  619         LIST_INIT(&nfsrc_templist);
  620         /*
  621          * Get all the matches and put them on the temp list.
  622          */
  623         rp = LIST_FIRST(hp);
  624         while (rp != LIST_END(hp)) {
  625                 nextrp = LIST_NEXT(rp, rc_hash);
                      /*
                       * An entry is a candidate only if every clause holds: same
                       * xid; not in progress (or in progress on the same socket
                       * with RC_SAMETCPCONN set); same NFS version; same
                       * procedure; the NFSv4 socket/connection-time clause; same
                       * request length; same checksum.
                       * NOTE(review): the RC_NFSV4 test is ANDed into that
                       * clause, so as written a request without RC_NFSV4 never
                       * selects any entry and is always treated as a miss.  The
                       * algorithm description at the top of the file reads as if
                       * the socket/connection-time exclusion were meant for NFSv4
                       * only - confirm the intent before changing anything.
                       */
  626                 if (newrp->rc_xid == rp->rc_xid &&
  627                     (!(rp->rc_flag & RC_INPROG) ||
  628                      ((newrp->rc_flag & RC_SAMETCPCONN) &&
  629                       newrp->rc_sockref == rp->rc_sockref)) &&
  630                     (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
  631                     newrp->rc_proc == rp->rc_proc &&
  632                     ((newrp->rc_flag & RC_NFSV4) &&
  633                      newrp->rc_sockref != rp->rc_sockref &&
  634                      newrp->rc_cachetime >= rp->rc_cachetime)
  635                     && newrp->rc_reqlen == rp->rc_reqlen &&
  636                     newrp->rc_cksum == rp->rc_cksum) {
  637                         LIST_REMOVE(rp, rc_hash);
  638                         LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
  639                 }
  640                 rp = nextrp;
  641         }
  642
  643         /*
  644          * Now, use nfsrc_templist to decide if there is a match.
               * Any candidate with rc_refcnt > 0 rules out a hit; i counts the
               * candidates visited before the loop ends.
  645          */
  646         i = 0;
  647         LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
  648                 i++;
  649                 if (rp->rc_refcnt > 0) {
  650                         hit = 0;
  651                         break;
  652                 }
  653         }
  654         /*
  655          * Can be a hit only if one entry left.
  656          * Note possible hit entry and put nfsrc_templist back on hash
  657          * list.
  658          */
  659         if (i != 1)
  660                 hit = 0;
  661         hitrp = rp = LIST_FIRST(&nfsrc_templist);
  662         while (rp != LIST_END(&nfsrc_templist)) {
  663                 nextrp = LIST_NEXT(rp, rc_hash);
  664                 LIST_REMOVE(rp, rc_hash);
  665                 LIST_INSERT_HEAD(hp, rp, rc_hash);
  666                 rp = nextrp;
  667         }
  668         if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
  669                 panic("nfs gettcp cache templist");
  670
  671         if (hit) {
  672                 rp = hitrp;
                      /*
                       * Someone else holds the entry: ask to be woken, sleep
                       * (bounded by 10 * hz) and redo the whole lookup.
                       */
  673                 if ((rp->rc_flag & RC_LOCKED) != 0) {
  674                         rp->rc_flag |= RC_WANTED;
  675                         (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
  676                             "nfsrc", 10 * hz);
  677                         goto tryagain;
  678                 }
  679                 if (rp->rc_flag == 0)
  680                         panic("nfs tcp cache0");
  681                 rp->rc_flag |= RC_LOCKED;
                      /*
                       * Each branch drops the mutex first and calls
                       * nfsrc_marksametcpconn() when the retry arrived on the
                       * socket that the cached entry records.
                       */
  682                 if (rp->rc_flag & RC_INPROG) {
  683                         nfsstatsv1.srvcache_inproghits++;
  684                         mtx_unlock(mutex);
  685                         if (newrp->rc_sockref == rp->rc_sockref)
  686                                 nfsrc_marksametcpconn(rp->rc_sockref);
  687                         ret = RC_DROPIT;
  688                 } else if (rp->rc_flag & RC_REPSTATUS) {
  689                         /*
  690                          * V2 only.
  691                          */
  692                         nfsstatsv1.srvcache_nonidemdonehits++;
  693                         mtx_unlock(mutex);
  694                         if (newrp->rc_sockref == rp->rc_sockref)
  695                                 nfsrc_marksametcpconn(rp->rc_sockref);
  696                         ret = RC_REPLY;
  697                         nfsrvd_rephead(nd);
  698                         *(nd->nd_errp) = rp->rc_status;
  699                         rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
  700                 } else if (rp->rc_flag & RC_REPMBUF) {
  701                         nfsstatsv1.srvcache_nonidemdonehits++;
  702                         mtx_unlock(mutex);
  703                         if (newrp->rc_sockref == rp->rc_sockref)
  704                                 nfsrc_marksametcpconn(rp->rc_sockref);
  705                         ret = RC_REPLY;
  706                         nd->nd_mreq = m_copym(rp->rc_reply, 0,
  707                                 M_COPYALL, M_WAITOK);
  708                         rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
  709                 } else {
  710                         panic("nfs tcp cache1");
  711                 }
                      /* Clear RC_LOCKED; newrp is no longer needed. */
  712                 nfsrc_unlock(rp);
  713                 free(newrp, M_NFSRVCACHE);
  714                 goto out;
  715         }
              /* Miss: the mutex is still held from the lookup above. */
  716         nfsstatsv1.srvcache_misses++;
  717         atomic_add_int(&nfsstatsv1.srvcache_size, 1);
  718
  719         /*
  720          * For TCP, multiple entries for a key are allowed, so don't
  721          * chain it into the hash table until done.
               * NOTE(review): the code below does link the new entry into the
               * hash chain right away, marked RC_INPROG, which contradicts the
               * sentence above - confirm which one is intended.
  722          */
  723         newrp->rc_cachetime = NFSD_MONOSEC;
  724         newrp->rc_flag |= RC_INPROG;
  725         LIST_INSERT_HEAD(hp, newrp, rc_hash);
  726         mtx_unlock(mutex);
  727         nd->nd_rp = newrp;
  728         ret = RC_DOIT;
  729
  730 out:
  731         NFSEXITCODE2(0, nd);
  732         return (ret);
  733 }
  734 
  735 /*
  736  * Lock a cache entry.
  737  */
  738 static void
  739 nfsrc_lock(struct nfsrvcache *rp)
  740 {
  741         struct mtx *mutex;
  742 
  743         mutex = nfsrc_cachemutex(rp);
  744         mtx_assert(mutex, MA_OWNED);
  745         while ((rp->rc_flag & RC_LOCKED) != 0) {
  746                 rp->rc_flag |= RC_WANTED;
  747                 (void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
  748         }
  749         rp->rc_flag |= RC_LOCKED;
  750 }
  751 
  752 /*
  753  * Unlock a cache entry.
  754  */
  755 static void
  756 nfsrc_unlock(struct nfsrvcache *rp)
  757 {
  758         struct mtx *mutex;
  759 
  760         mutex = nfsrc_cachemutex(rp);
  761         mtx_lock(mutex);
  762         rp->rc_flag &= ~RC_LOCKED;
  763         nfsrc_wanted(rp);
  764         mtx_unlock(mutex);
  765 }
  766 
  767 /*
  768  * Wakeup anyone wanting entry.
  769  */
  770 static void
  771 nfsrc_wanted(struct nfsrvcache *rp)
  772 {
  773         if (rp->rc_flag & RC_WANTED) {
  774                 rp->rc_flag &= ~RC_WANTED;
  775                 wakeup((caddr_t)rp);
  776         }
  777 }
  778 
  779 /*
  780  * Free up the entry.
  781  * Must not sleep.
  782  */
  783 static void
  784 nfsrc_freecache(struct nfsrvcache *rp)
  785 {
  786         struct nfsrchash_bucket *hbp;
  787 
  788         LIST_REMOVE(rp, rc_hash);
  789         if (rp->rc_flag & RC_UDP) {
  790                 TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
  791                 nfsrc_udpcachesize--;
  792         } else if (rp->rc_acked != RC_NO_SEQ) {
  793                 hbp = NFSRCAHASH(rp->rc_sockref);
  794                 mtx_lock(&hbp->mtx);
  795                 if (rp->rc_acked == RC_NO_ACK)
  796                         LIST_REMOVE(rp, rc_ahash);
  797                 mtx_unlock(&hbp->mtx);
  798         }
  799         nfsrc_wanted(rp);
  800         if (rp->rc_flag & RC_REPMBUF) {
  801                 mbuf_freem(rp->rc_reply);
  802                 if (!(rp->rc_flag & RC_UDP))
  803                         atomic_add_int(&nfsrc_tcpsavedreplies, -1);
  804         }
  805         free(rp, M_NFSRVCACHE);
  806         atomic_add_int(&nfsstatsv1.srvcache_size, -1);
  807 }
  808 
  809 /*
  810  * Clean out the cache. Called when nfsserver module is unloaded.
  811  */
  812 APPLESTATIC void
  813 nfsrvd_cleancache(void)
  814 {
  815         struct nfsrvcache *rp, *nextrp;
  816         int i;
  817 
  818         for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
  819                 mtx_lock(&nfsrchash_table[i].mtx);
  820                 LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
  821                         nfsrc_freecache(rp);
  822                 mtx_unlock(&nfsrchash_table[i].mtx);
  823         }
  824         mtx_lock(&nfsrc_udpmtx);
  825         for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
  826                 LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
  827                         nfsrc_freecache(rp);
  828                 }
  829         }
  830         nfsstatsv1.srvcache_size = 0;
  831         mtx_unlock(&nfsrc_udpmtx);
  832         nfsrc_tcpsavedreplies = 0;
  833 }
  834 
  835 #define HISTSIZE        16
  836 /*
  837  * The basic rule is to get rid of entries that are expired.
  838  */
  839 void
  840 nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
  841 {
  842         struct nfsrchash_bucket *hbp;
  843         struct nfsrvcache *rp, *nextrp;
  844         int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
  845         time_t thisstamp;
  846         static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
  847         static int onethread = 0, oneslot = 0;
  848 
  849         if (sockref != 0) {
  850                 hbp = NFSRCAHASH(sockref);
  851                 mtx_lock(&hbp->mtx);
  852                 LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
  853                         if (sockref == rp->rc_sockref) {
  854                                 if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
  855                                         rp->rc_acked = RC_ACK;
  856                                         LIST_REMOVE(rp, rc_ahash);
  857                                 } else if (final) {
  858                                         rp->rc_acked = RC_NACK;
  859                                         LIST_REMOVE(rp, rc_ahash);
  860                                 }
  861                         }
  862                 }
  863                 mtx_unlock(&hbp->mtx);
  864         }
  865 
  866         if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
  867                 return;
  868         if (NFSD_MONOSEC != udp_lasttrim ||
  869             nfsrc_udpcachesize >= (nfsrc_udphighwater +
  870             nfsrc_udphighwater / 2)) {
  871                 mtx_lock(&nfsrc_udpmtx);
  872                 udp_lasttrim = NFSD_MONOSEC;
  873                 TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
  874                         if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
  875                              && rp->rc_refcnt == 0
  876                              && ((rp->rc_flag & RC_REFCNT) ||
  877                                  udp_lasttrim > rp->rc_timestamp ||
  878                                  nfsrc_udpcachesize > nfsrc_udphighwater))
  879                                 nfsrc_freecache(rp);
  880                 }
  881                 mtx_unlock(&nfsrc_udpmtx);
  882         }
  883         if (NFSD_MONOSEC != tcp_lasttrim ||
  884             nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
  885                 force = nfsrc_tcphighwater / 4;
  886                 if (force > 0 &&
  887                     nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
  888                         for (i = 0; i < HISTSIZE; i++)
  889                                 time_histo[i] = 0;
  890                         i = 0;
  891                         lastslot = NFSRVCACHE_HASHSIZE - 1;
  892                 } else {
  893                         force = 0;
  894                         if (NFSD_MONOSEC != tcp_lasttrim) {
  895                                 i = 0;
  896                                 lastslot = NFSRVCACHE_HASHSIZE - 1;
  897                         } else {
  898                                 lastslot = i = oneslot;
  899                                 if (++oneslot >= NFSRVCACHE_HASHSIZE)
  900                                         oneslot = 0;
  901                         }
  902                 }
  903                 tto = nfsrc_tcptimeout;
  904                 tcp_lasttrim = NFSD_MONOSEC;
  905                 for (; i <= lastslot; i++) {
  906                         mtx_lock(&nfsrchash_table[i].mtx);
  907                         LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
  908                             nextrp) {
  909                                 if (!(rp->rc_flag &
  910                                      (RC_INPROG|RC_LOCKED|RC_WANTED))
  911                                      && rp->rc_refcnt == 0) {
  912                                         if ((rp->rc_flag & RC_REFCNT) ||
  913                                             tcp_lasttrim > rp->rc_timestamp ||
  914                                             rp->rc_acked == RC_ACK) {
  915                                                 nfsrc_freecache(rp);
  916                                                 continue;
  917                                         }
  918 
  919                                         if (force == 0)
  920                                                 continue;
  921                                         /*
  922                                          * The timestamps range from roughly the
  923                                          * present (tcp_lasttrim) to the present
  924                                          * + nfsrc_tcptimeout. Generate a simple
  925                                          * histogram of where the timeouts fall.
  926                                          */
  927                                         j = rp->rc_timestamp - tcp_lasttrim;
  928                                         if (j >= tto)
  929                                                 j = HISTSIZE - 1;
  930                                         else if (j < 0)
  931                                                 j = 0;
  932                                         else
  933                                                 j = j * HISTSIZE / tto;
  934                                         time_histo[j]++;
  935                                 }
  936                         }
  937                         mtx_unlock(&nfsrchash_table[i].mtx);
  938                 }
  939                 if (force) {
  940                         /*
  941                          * Trim some more with a smaller timeout of as little
  942                          * as 20% of nfsrc_tcptimeout to try and get below
  943                          * 80% of the nfsrc_tcphighwater.
  944                          */
  945                         k = 0;
  946                         for (i = 0; i < (HISTSIZE - 2); i++) {
  947                                 k += time_histo[i];
  948                                 if (k > force)
  949                                         break;
  950                         }
  951                         k = tto * (i + 1) / HISTSIZE;
  952                         if (k < 1)
  953                                 k = 1;
  954                         thisstamp = tcp_lasttrim + k;
  955                         for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
  956                                 mtx_lock(&nfsrchash_table[i].mtx);
  957                                 LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
  958                                     rc_hash, nextrp) {
  959                                         if (!(rp->rc_flag &
  960                                              (RC_INPROG|RC_LOCKED|RC_WANTED))
  961                                              && rp->rc_refcnt == 0
  962                                              && ((rp->rc_flag & RC_REFCNT) ||
  963                                                  thisstamp > rp->rc_timestamp ||
  964                                                  rp->rc_acked == RC_ACK))
  965                                                 nfsrc_freecache(rp);
  966                                 }
  967                                 mtx_unlock(&nfsrchash_table[i].mtx);
  968                         }
  969                 }
  970         }
  971         atomic_store_rel_int(&onethread, 0);
  972 }
  973 
  974 /*
  975  * Add a seqid# reference to the cache entry.
  976  */
  977 APPLESTATIC void
  978 nfsrvd_refcache(struct nfsrvcache *rp)
  979 {
  980         struct mtx *mutex;
  981 
  982         if (rp == NULL)
  983                 /* For NFSv4.1, there is no cache entry. */
  984                 return;
  985         mutex = nfsrc_cachemutex(rp);
  986         mtx_lock(mutex);
  987         if (rp->rc_refcnt < 0)
  988                 panic("nfs cache refcnt");
  989         rp->rc_refcnt++;
  990         mtx_unlock(mutex);
  991 }
  992 
  993 /*
  994  * Dereference a seqid# cache entry.
  995  */
  996 APPLESTATIC void
  997 nfsrvd_derefcache(struct nfsrvcache *rp)
  998 {
  999         struct mtx *mutex;
 1000 
 1001         mutex = nfsrc_cachemutex(rp);
 1002         mtx_lock(mutex);
 1003         if (rp->rc_refcnt <= 0)
 1004                 panic("nfs cache derefcnt");
 1005         rp->rc_refcnt--;
 1006         if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
 1007                 nfsrc_freecache(rp);
 1008         mtx_unlock(mutex);
 1009 }
 1010 
 1011 /*
 1012  * Calculate the length of the mbuf list and a checksum on the first up to
 1013  * NFSRVCACHE_CHECKLEN bytes.
 1014  */
 1015 static int
 1016 nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
 1017 {
 1018         int len = 0, cklen;
 1019         mbuf_t m;
 1020 
 1021         m = m1;
 1022         while (m) {
 1023                 len += mbuf_len(m);
 1024                 m = mbuf_next(m);
 1025         }
 1026         cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
 1027         *cksum = in_cksum(m1, cklen);
 1028         return (len);
 1029 }
 1030 
 1031 /*
 1032  * Mark a TCP connection that is seeing retries. Should never happen for
 1033  * NFSv4.
 1034  */
 1035 static void
 1036 nfsrc_marksametcpconn(u_int64_t sockref)
 1037 {
 1038 }
 1039
Cache object: cb4675bd5af7e07b14c797cd04f2747c
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/fs/nfsserver/nfs_nfsdcache.c

FreeBSD/Linux Kernel Cross Reference
sys/fs/nfsserver/nfs_nfsdcache.c