1 /*-
2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD: releng/10.0/sys/fs/nfsserver/nfs_nfsdcache.c 254337 2013-08-14 21:11:26Z rmacklem $");
36
37 /*
38 * Here is the basic algorithm:
39 * First, some design criteria I used:
40 * - I think a false hit is more serious than a false miss
41 * - A false hit for an RPC that has Op(s) that order via seqid# must be
42 * avoided at all cost
43 * - A valid hit will probably happen a long time after the original reply
44 * and the TCP socket that the original request was received on will no
45 * longer be active
46 * (The long time delay implies to me that LRU is not appropriate.)
47 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
48 * in them as well as minimizing the risk of redoing retried non-idempotent
49 * Ops.
50 * Because it is biased towards avoiding false hits, multiple entries with
51 * the same xid are to be expected, especially for the case of the entry
52 * in the cache being related to a seqid# sequenced Op.
53 *
54 * The basic algorithm I'm about to code up:
55 * - Null RPCs bypass the cache and are just done
56 * For TCP
57 * - key on <xid, NFS version> (as noted above, there can be several
58 * entries with the same key)
59 * When a request arrives:
60 * For all that match key
61 * - if RPC# != OR request_size !=
62 * - not a match with this one
63 * - if NFSv4 and received on same TCP socket OR
64 * received on a TCP connection created before the
65 * entry was cached
66 * - not a match with this one
67 * (V2,3 clients might retry on same TCP socket)
68 * - calculate checksum on first N bytes of NFS XDR
69 * - if checksum !=
70 * - not a match for this one
71 * If any of the remaining ones that match has a
72 * seqid_refcnt > 0
73 * - not a match (go do RPC, using new cache entry)
74 * If one match left
75 * - a hit (reply from cache)
76 * else
77 * - miss (go do RPC, using new cache entry)
78 *
79 * During processing of NFSv4 request:
80 * - set a flag when a non-idempotent Op is processed
81 * - when an Op that uses a seqid# (Open,...) is processed
82 * - if same seqid# as referenced entry in cache
83 * - free new cache entry
84 * - reply from referenced cache entry
85 * else if next seqid# in order
86 * - free referenced cache entry
87 * - increment seqid_refcnt on new cache entry
88 * - set pointer from Openowner/Lockowner to
89 * new cache entry (aka reference it)
90 * else if first seqid# in sequence
91 * - increment seqid_refcnt on new cache entry
92 * - set pointer from Openowner/Lockowner to
93 * new cache entry (aka reference it)
94 *
95 * At end of RPC processing:
96 * - if seqid_refcnt > 0 OR flagged non-idempotent on new
97 * cache entry
98 * - save reply in cache entry
99 * - calculate checksum on first N bytes of NFS XDR
100 * request
101 * - note op and length of XDR request (in bytes)
102 * - timestamp it
103 * else
104 * - free new cache entry
105 * - Send reply (noting info for socket activity check, below)
106 *
107 * For cache entries saved above:
108 * - if saved since seqid_refcnt was > 0
109 * - free when seqid_refcnt decrements to 0
110 * (when next one in sequence is processed above, or
111 * when Openowner/Lockowner is discarded)
112 * else { non-idempotent Op(s) }
113 * - free when
114 * - some further activity observed on same
115 * socket
116 * (I'm not yet sure how I'm going to do
117 * this. Maybe look at the TCP connection
118 * to see if the send_tcp_sequence# is well
119 * past sent reply OR K additional RPCs
120 * replied on same socket OR?)
121 * OR
122 * - when very old (hours, days, weeks?)
123 *
124 * For UDP (v2, 3 only), pretty much the old way:
125 * - key on <xid, NFS version, RPC#, Client host ip#>
126 * (at most one entry for each key)
127 *
128 * When a Request arrives:
129 * - if a match with entry via key
130 * - if RPC marked In_progress
131 * - discard request (don't send reply)
132 * else
133 * - reply from cache
134 * - timestamp cache entry
135 * else
136 * - add entry to cache, marked In_progress
137 * - do RPC
138 * - when RPC done
139 * - if RPC# non-idempotent
140 * - mark entry Done (not In_progress)
141 * - save reply
142 * - timestamp cache entry
143 * else
144 * - free cache entry
145 * - send reply
146 *
147 * Later, entries with saved replies are free'd a short time (few minutes)
148 * after reply sent (timestamp).
149 * Reference: Chet Juszczak, "Improving the Performance and Correctness
150 * of an NFS Server", in Proc. Winter 1989 USENIX Conference,
151 * pages 53-63. San Diego, February 1989.
152 * for the UDP case.
153 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
154 * for TCP. For V3, a reply won't be saved when the flood level is
155 * hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
156 * that case. This level should be set high enough that this almost
157 * never happens.
158 */
159 #ifndef APPLEKEXT
160 #include <fs/nfs/nfsport.h>
161
162 extern struct nfsstats newnfsstats;
163 extern struct mtx nfsrc_udpmtx;
164 extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
165 int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
166 #endif /* !APPLEKEXT */
167
168 SYSCTL_DECL(_vfs_nfsd);
169
170 static u_int nfsrc_tcphighwater = 0;
171 static int
172 sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
173 {
174 int error, newhighwater;
175
176 newhighwater = nfsrc_tcphighwater;
177 error = sysctl_handle_int(oidp, &newhighwater, 0, req);
178 if (error != 0 || req->newptr == NULL)
179 return (error);
180 if (newhighwater < 0)
181 return (EINVAL);
182 if (newhighwater >= nfsrc_floodlevel)
183 nfsrc_floodlevel = newhighwater + newhighwater / 5;
184 nfsrc_tcphighwater = newhighwater;
185 return (0);
186 }
187 SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
188 sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
189 "High water mark for TCP cache entries");
190
191 static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
192 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
193 &nfsrc_udphighwater, 0,
194 "High water mark for UDP cache entries");
195 static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
196 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
197 &nfsrc_tcptimeout, 0,
198 "Timeout for TCP entries in the DRC");
199 static u_int nfsrc_tcpnonidempotent = 1;
200 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
201 &nfsrc_tcpnonidempotent, 0,
202 "Enable the DRC for NFS over TCP");
203
204 static int nfsrc_udpcachesize = 0;
205 static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
206 static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];
207
208 /*
209 * and the reverse mapping from generic to Version 2 procedure numbers
210 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,		/* 0: Null */
	NFSV2PROC_GETATTR,	/* 1: Getattr */
	NFSV2PROC_SETATTR,	/* 2: Setattr */
	NFSV2PROC_LOOKUP,	/* 3: Lookup */
	NFSV2PROC_NOOP,		/* 4: Access (no V2 equivalent) */
	NFSV2PROC_READLINK,	/* 5: Readlink */
	NFSV2PROC_READ,		/* 6: Read */
	NFSV2PROC_WRITE,	/* 7: Write */
	NFSV2PROC_CREATE,	/* 8: Create */
	NFSV2PROC_MKDIR,	/* 9: Mkdir */
	NFSV2PROC_SYMLINK,	/* 10: Symlink */
	NFSV2PROC_CREATE,	/* 11: Mknod, mapped to V2 Create */
	NFSV2PROC_REMOVE,	/* 12: Remove */
	NFSV2PROC_RMDIR,	/* 13: Rmdir */
	NFSV2PROC_RENAME,	/* 14: Rename */
	NFSV2PROC_LINK,		/* 15: Link */
	NFSV2PROC_READDIR,	/* 16: Readdir */
	NFSV2PROC_NOOP,		/* 17: Readdirplus (no V2 equivalent) */
	NFSV2PROC_STATFS,	/* 18: Fsstat */
	NFSV2PROC_NOOP,		/* 19: Fsinfo (no V2 equivalent) */
	NFSV2PROC_NOOP,		/* 20: Pathconf (no V2 equivalent) */
	NFSV2PROC_NOOP,		/* 21: Commit (no V2 equivalent) */
};
235
236 #define nfsrc_hash(xid) (((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
237 #define NFSRCUDPHASH(xid) \
238 (&nfsrvudphashtbl[nfsrc_hash(xid)])
239 #define NFSRCHASH(xid) \
240 (&nfsrchash_table[nfsrc_hash(xid)].tbl)
241 #define TRUE 1
242 #define FALSE 0
243 #define NFSRVCACHE_CHECKLEN 100
244
245 /* True iff the rpc reply is an nfs status ONLY! */
/* True iff the rpc reply is an nfs status ONLY! */
/*
 * Indexed by NFSv2 procedure number (the table is consulted as
 * nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]] in nfsrvd_updatecache()).
 * TRUE entries are the V2 procedures whose reply is a bare status word,
 * so the reply can be cached as rc_status instead of an mbuf copy.
 */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,			/* 0: Null */
	FALSE,			/* 1: Getattr */
	FALSE,			/* 2: Setattr */
	FALSE,			/* 3: Root */
	FALSE,			/* 4: Lookup */
	FALSE,			/* 5: Readlink */
	FALSE,			/* 6: Read */
	FALSE,			/* 7: Writecache */
	FALSE,			/* 8: Write */
	FALSE,			/* 9: Create */
	TRUE,			/* 10: Remove */
	TRUE,			/* 11: Rename */
	TRUE,			/* 12: Link */
	TRUE,			/* 13: Symlink */
	FALSE,			/* 14: Mkdir */
	TRUE,			/* 15: Rmdir */
	FALSE,			/* 16: Readdir */
	FALSE,			/* 17: Statfs */
	FALSE,			/* unused */
	FALSE,			/* unused */
	FALSE,			/* unused */
	FALSE,			/* unused */
};
270
271 /*
272 * Will NFS want to work over IPv6 someday?
273 */
274 #define NETFAMILY(rp) \
275 (((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)
276
277 /* local functions */
278 static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
279 static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
280 static void nfsrc_lock(struct nfsrvcache *rp);
281 static void nfsrc_unlock(struct nfsrvcache *rp);
282 static void nfsrc_wanted(struct nfsrvcache *rp);
283 static void nfsrc_freecache(struct nfsrvcache *rp);
284 static void nfsrc_trimcache(u_int64_t, struct socket *);
285 static int nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t,
286 struct socket *);
287 static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
288 static void nfsrc_marksametcpconn(u_int64_t);
289
290 /*
291 * Return the correct mutex for this cache entry.
292 */
293 static __inline struct mtx *
294 nfsrc_cachemutex(struct nfsrvcache *rp)
295 {
296
297 if ((rp->rc_flag & RC_UDP) != 0)
298 return (&nfsrc_udpmtx);
299 return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
300 }
301
302 /*
303 * Initialize the server request cache list
304 */
305 APPLESTATIC void
306 nfsrvd_initcache(void)
307 {
308 int i;
309 static int inited = 0;
310
311 if (inited)
312 return;
313 inited = 1;
314 for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
315 LIST_INIT(&nfsrvudphashtbl[i]);
316 LIST_INIT(&nfsrchash_table[i].tbl);
317 }
318 TAILQ_INIT(&nfsrvudplru);
319 nfsrc_tcpsavedreplies = 0;
320 nfsrc_udpcachesize = 0;
321 newnfsstats.srvcache_tcppeak = 0;
322 newnfsstats.srvcache_size = 0;
323 }
324
325 /*
326 * Get a cache entry for this request. Basically just malloc a new one
327 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
328 * Call nfsrc_trimcache() to clean up the cache before returning.
329 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *newrp;
	int ret;

	/* Null RPCs bypass the cache entirely (see design notes above). */
	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	/* Record the NFS protocol version of this request in the entry. */
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	/* Copy the cache key material out of the request descriptor. */
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	/* nd_nam2 is only non-NULL for datagram (UDP) requests. */
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	/* Opportunistically expire old entries on every lookup. */
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (ret);
}
363
364 /*
365 * For UDP (v2, v3):
366 * - key on <xid, NFS version, RPC#, Client host ip#>
367 * (at most one entry for each key)
368 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    /* Full UDP key: xid, RPC#, NFS version and client address. */
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			/*
			 * Someone else holds the entry; sleep (PDROP releases
			 * the mutex) and rescan from the top, since the entry
			 * may have been freed while we slept.
			 */
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			/* Move the entry to the tail of the LRU list. */
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				/* Retry of an RPC still in progress: drop it. */
				newnfsstats.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				/* Replay the cached status-only reply. */
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				/* Replay a copy of the cached reply mbufs. */
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			/* Hit: the new entry was not needed after all. */
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	/* Miss: insert the new entry, marked In_progress, and do the RPC. */
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
454
455 /*
456 * Update a request cache entry after the rpc has been done
457 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	/* The RPC is done, so move the UDP entry to the tail of the LRU. */
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		newnfsstats.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		/* Discard the newly built reply; replay the cached one. */
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		/* Only NFSv4 seqid#-sequenced entries may be referenced. */
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			/* V2 status-only reply: cache just the status word. */
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				/* Track the peak count of saved TCP replies. */
				atomic_add_int(&nfsrc_tcpsavedreplies, 1);
				if (nfsrc_tcpsavedreplies >
				    newnfsstats.srvcache_tcppeak)
					newnfsstats.srvcache_tcppeak =
					    nfsrc_tcpsavedreplies;
			}
			/*
			 * Drop the mutex around m_copym(), since M_WAITOK
			 * may sleep, then store the copy under the lock.
			 */
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			/*
			 * A referenced entry is unlocked here; otherwise the
			 * locked entry is returned so nfsrvd_sentcache() can
			 * record the TCP sequence number and unlock it.
			 */
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		/* Not worth saving: discard the entry. */
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (retrp);
}
556
557 /*
558 * Invalidate and, if possible, free an in prog cache entry.
559 * Must not sleep.
560 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	/*
	 * NOTE(review): rc_flag is read before the mutex is acquired; this
	 * matches upstream and relies on the caller still owning the
	 * in-progress entry — confirm before reordering.
	 */
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	/* Free now only if nobody references or holds the entry. */
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
575
576 /*
577 * Called after nfsrvd_updatecache() once the reply is sent, to update
578 * the entry for nfsrc_activesocket() and unlock it. The argument is
579 * the pointer returned by nfsrvd_updatecache().
580 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, struct socket *so, int err)
{
	tcp_seq tmp_seq;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	/* The entry must still be locked by nfsrvd_updatecache(). */
	if (!(rp->rc_flag & RC_LOCKED))
		panic("nfsrvd_sentcache not locked");
	if (!err) {
		/* Only TCP over INET/INET6 sockets can reach here. */
		if ((so->so_proto->pr_domain->dom_family != AF_INET &&
		     so->so_proto->pr_domain->dom_family != AF_INET6) ||
		    so->so_proto->pr_protocol != IPPROTO_TCP)
			panic("nfs sent cache");
		/*
		 * Record the TCP send sequence number so that
		 * nfsrc_activesocket() can later tell whether the client
		 * has acknowledged the reply.
		 */
		if (nfsrv_getsockseqnum(so, &tmp_seq)) {
			mtx_lock(mutex);
			rp->rc_tcpseq = tmp_seq;
			rp->rc_flag |= RC_TCPSEQ;
			mtx_unlock(mutex);
		}
	}
	nfsrc_unlock(rp);
}
604
605 /*
606 * Get a cache entry for TCP
607 * - key on <xid, nfs version>
608 * (allow multiple entries for a given key)
609 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	/* Length and checksum of the request are part of the TCP match. */
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		/*
		 * Match criteria (see the algorithm description at the top
		 * of the file): same xid, version, RPC#, request length and
		 * checksum; an In_progress entry only matches a retry on
		 * the same TCP connection.
		 *
		 * NOTE(review): the RC_NFSV4 clause below requires the new
		 * request to be NFSv4, on a different connection, created
		 * no earlier than the cached entry.  The design notes above
		 * suggest V2/V3 retries should also be able to match —
		 * confirm against upstream before changing.
		 */
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		/* A seqid#-referenced entry disqualifies a cache hit. */
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		/*
		 * Wait for a locked entry (PDROP releases the mutex) and
		 * redo the whole scan, since it may change while we sleep.
		 */
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			/* Retry of an RPC still in progress: drop it. */
			newnfsstats.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			/* Replay a copy of the saved reply mbuf chain. */
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		/* Hit: the new entry was not needed after all. */
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
741
742 /*
743 * Lock a cache entry.
744 */
745 static void
746 nfsrc_lock(struct nfsrvcache *rp)
747 {
748 struct mtx *mutex;
749
750 mutex = nfsrc_cachemutex(rp);
751 mtx_assert(mutex, MA_OWNED);
752 while ((rp->rc_flag & RC_LOCKED) != 0) {
753 rp->rc_flag |= RC_WANTED;
754 (void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
755 }
756 rp->rc_flag |= RC_LOCKED;
757 }
758
759 /*
760 * Unlock a cache entry.
761 */
762 static void
763 nfsrc_unlock(struct nfsrvcache *rp)
764 {
765 struct mtx *mutex;
766
767 mutex = nfsrc_cachemutex(rp);
768 mtx_lock(mutex);
769 rp->rc_flag &= ~RC_LOCKED;
770 nfsrc_wanted(rp);
771 mtx_unlock(mutex);
772 }
773
774 /*
775 * Wakeup anyone wanting entry.
776 */
777 static void
778 nfsrc_wanted(struct nfsrvcache *rp)
779 {
780 if (rp->rc_flag & RC_WANTED) {
781 rp->rc_flag &= ~RC_WANTED;
782 wakeup((caddr_t)rp);
783 }
784 }
785
786 /*
787 * Free up the entry.
788 * Must not sleep.
789 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{

	/* Caller holds the entry's mutex; this must not sleep. */
	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	}
	/* Wake any thread still waiting for this entry before freeing it. */
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		/* Only TCP replies count against nfsrc_tcpsavedreplies. */
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&newnfsstats.srvcache_size, -1);
}
808
809 /*
810 * Clean out the cache. Called when nfsserver module is unloaded.
811 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	/* Free every TCP entry, one hash bucket (and bucket mutex) at a time. */
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	/* All UDP buckets share one mutex, so hold it across the sweep. */
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	/* Reset the counters now that the cache is empty. */
	newnfsstats.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}
834
835 /*
836 * The basic rule is to get rid of entries that are expired.
837 */
static void
nfsrc_trimcache(u_int64_t sockref, struct socket *so)
{
	struct nfsrvcache *rp, *nextrp;
	int i, j, k, time_histo[10];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0;

	/* Allow only one thread at a time to trim; others just return. */
	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	/* Trim UDP at most once per second unless well over the high water. */
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			/* Free idle entries that are expired or over limit. */
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	/* Same once-per-second pacing for the TCP buckets. */
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		for (i = 0; i < 10; i++)
			time_histo[i] = 0;
		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			if (i == 0)
				tcp_lasttrim = NFSD_MONOSEC;
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= nfsrc_tcptimeout)
						j = nfsrc_tcptimeout - 1;
					if (j < 0)
						j = 0;
					j = (j * 10 / nfsrc_tcptimeout) % 10;
					time_histo[j]++;
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    nfsrc_activesocket(rp, sockref, so))
						nfsrc_freecache(rp);
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		j = nfsrc_tcphighwater / 5;	/* 20% of it */
		if (j > 0 && (nfsrc_tcpsavedreplies + j) > nfsrc_tcphighwater) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			/* Pick the smallest timeout decile covering ~20%. */
			k = 0;
			for (i = 0; i < 8; i++) {
				k += time_histo[i];
				if (k > j)
					break;
			}
			k = nfsrc_tcptimeout * (i + 1) / 10;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			/* Second pass: free entries expiring before thisstamp. */
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 nfsrc_activesocket(rp, sockref,
						    so)))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}
934
935 /*
936 * Add a seqid# reference to the cache entry.
937 */
938 APPLESTATIC void
939 nfsrvd_refcache(struct nfsrvcache *rp)
940 {
941 struct mtx *mutex;
942
943 mutex = nfsrc_cachemutex(rp);
944 mtx_lock(mutex);
945 if (rp->rc_refcnt < 0)
946 panic("nfs cache refcnt");
947 rp->rc_refcnt++;
948 mtx_unlock(mutex);
949 }
950
951 /*
952 * Dereference a seqid# cache entry.
953 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	/* Free once unreferenced, unless the entry is locked or in use. */
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
968
969 /*
970 * Check to see if the socket is active.
971 * Return 1 if the reply has been received/acknowledged by the client,
972 * 0 otherwise.
973 * XXX - Uses tcp internals.
974 */
975 static int
976 nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t cur_sockref,
977 struct socket *cur_so)
978 {
979 int ret = 0;
980
981 if (!(rp->rc_flag & RC_TCPSEQ))
982 return (ret);
983 /*
984 * If the sockref is the same, it is the same TCP connection.
985 */
986 if (cur_sockref == rp->rc_sockref)
987 ret = nfsrv_checksockseqnum(cur_so, rp->rc_tcpseq);
988 return (ret);
989 }
990
991 /*
992 * Calculate the length of the mbuf list and a checksum on the first up to
993 * NFSRVCACHE_CHECKLEN bytes.
994 */
995 static int
996 nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
997 {
998 int len = 0, cklen;
999 mbuf_t m;
1000
1001 m = m1;
1002 while (m) {
1003 len += mbuf_len(m);
1004 m = mbuf_next(m);
1005 }
1006 cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
1007 *cksum = in_cksum(m1, cklen);
1008 return (len);
1009 }
1010
1011 /*
1012 * Mark a TCP connection that is seeing retries. Should never happen for
1013 * NFSv4.
1014 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
	/*
	 * Intentionally empty: retries on the same TCP connection are
	 * currently only counted via the call sites in nfsrc_gettcp();
	 * no per-connection marking is done yet.
	 */
}
1019
Cache object: fa85e763d6f22ca7d9a3e922f0fac167
|