/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 *   When a request arrives:
 *	For all that match key
 *	- if RPC# != OR request_size !=
 *	  - not a match with this one
 *	- if NFSv4 and received on same TCP socket OR
 *	  received on a TCP connection created before the
 *	  entry was cached
 *	  - not a match with this one
 *	  (V2,3 clients might retry on same TCP socket)
 *	- calculate checksum on first N bytes of NFS XDR
 *	- if checksum !=
 *	  - not a match for this one
 *	If any of the remaining ones that match has a
 *	  seqid_refcnt > 0
 *	  - not a match (go do RPC, using new cache entry)
 *	If one match left
 *	  - a hit (reply from cache)
 *	else
 *	  - miss (go do RPC, using new cache entry)
 *
 *   During processing of NFSv4 request:
 *	- set a flag when a non-idempotent Op is processed
 *	- when an Op that uses a seqid# (Open,...) is processed
 *	  - if same seqid# as referenced entry in cache
 *	    - free new cache entry
 *	    - reply from referenced cache entry
 *	  else if next seqid# in order
 *	    - free referenced cache entry
 *	    - increment seqid_refcnt on new cache entry
 *	    - set pointer from Openowner/Lockowner to
 *	      new cache entry (aka reference it)
 *	  else if first seqid# in sequence
 *	    - increment seqid_refcnt on new cache entry
 *	    - set pointer from Openowner/Lockowner to
 *	      new cache entry (aka reference it)
 *
 *   At end of RPC processing:
 *	- if seqid_refcnt > 0 OR flagged non-idempotent on new
 *	  cache entry
 *	  - save reply in cache entry
 *	  - calculate checksum on first N bytes of NFS XDR
 *	    request
 *	  - note op and length of XDR request (in bytes)
 *	  - timestamp it
 *	else
 *	  - free new cache entry
 *	- Send reply (noting info for socket activity check, below)
 *
 *   For cache entries saved above:
 *	- if saved since seqid_refcnt was > 0
 *	  - free when seqid_refcnt decrements to 0
 *	    (when next one in sequence is processed above, or
 *	    when Openowner/Lockowner is discarded)
 *	else { non-idempotent Op(s) }
 *	  - free when
 *	    - some further activity observed on same
 *	      socket
 *	      (I'm not yet sure how I'm going to do
 *	      this. Maybe look at the TCP connection
 *	      to see if the send_tcp_sequence# is well
 *	      past sent reply OR K additional RPCs
 *	      replied on same socket OR?)
 *	    OR
 *	    - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *	- if RPC marked In_progress
 *	  - discard request (don't send reply)
 *	else
 *	  - reply from cache
 *	  - timestamp cache entry
 * else
 *	- add entry to cache, marked In_progress
 *	- do RPC
 *	- when RPC done
 *	  - if RPC# non-idempotent
 *	    - mark entry Done (not In_progress)
 *	    - save reply
 *	    - timestamp cache entry
 *	  else
 *	    - free cache entry
 *	- send reply
 *
 * Later, entries with saved replies are freed a short time (a few minutes)
 *	after the reply is sent (timestamp).
 * Reference (for the UDP case): Chet Juszczak, "Improving the Performance
 *	and Correctness of an NFS Server", in Proc. Winter 1989 USENIX
 *	Conference, pages 53-63, San Diego, February 1989.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP.  For V3, a reply won't be saved when the flood level is
 *	hit.  For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case.  This level should be set high enough that this almost
 *	never happens.
 */
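/*
 * A rough usage sketch (the real caller is the nfsd service loop, not this
 * file) of how the entry points below are expected to be driven, based on
 * their return values; the variable names here are illustrative only:
 *
 *	switch (nfsrvd_getcache(nd)) {
 *	case RC_DROPIT:	retry of an in-progress request;
 *			drop it without replying; break;
 *	case RC_REPLY:	duplicate; nd->nd_mreq holds the cached reply;
 *			send nd->nd_mreq; break;
 *	case RC_DOIT:	new request; nd->nd_rp points at the entry;
 *			perform the RPC;
 *			rp = nfsrvd_updatecache(nd);
 *			send the reply;
 *			if (rp != NULL)
 *				nfsrvd_sentcache(rp, have_seq, seq);
 *			break;
 *	}
 */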
159 #include <fs/nfs/nfsport.h>
160
161 extern struct nfsstatsv1 nfsstatsv1;
162 extern struct mtx nfsrc_udpmtx;
163 extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
164 extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
165 int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
166
167 SYSCTL_DECL(_vfs_nfsd);
168
169 static u_int nfsrc_tcphighwater = 0;
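/*
 * Sysctl handler for vfs.nfsd.tcphighwater.  Negative values are rejected
 * and, when the new high water mark reaches the current flood level, the
 * flood level is raised to 120% of the new value so that saved replies are
 * not capped below the high water mark.
 */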
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The reverse mapping, from generic procedure numbers to Version 2
 * procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

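/*
 * Hash on the xid: the high-order byte is folded into the low-order bits
 * before taking the modulus, so xids that differ only in their upper bits
 * still spread across the hash buckets.
 */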
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid)	(&nfsrcahash_table[nfsrc_hash(xid)])
#define	TRUE	1
#define	FALSE	0
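/* The "N" from the description above: how many bytes of the XDR request are checksummed. */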
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	nfsstatsv1.srvcache_tcppeak = 0;
	nfsstatsv1.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *	(at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
		if (newrp->rc_xid == rp->rc_xid &&
		    newrp->rc_proc == rp->rc_proc &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				nfsstatsv1.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		nfsstatsv1.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				atomic_add_int(&nfsrc_tcpsavedreplies, 1);
				if (nfsrc_tcpsavedreplies >
				    nfsstatsv1.srvcache_tcppeak)
					nfsstatsv1.srvcache_tcppeak =
					    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *	(allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			nfsstatsv1.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&nfsstatsv1.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	nfsstatsv1.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define	HISTSIZE	16
/*
 * Trim the cache.  The basic rule is to get rid of entries that have
 * expired.  In addition, when called with a non-zero sockref, entries for
 * that socket whose replies have been acknowledged (snd_una at or beyond
 * the recorded TCP sequence number) are marked RC_ACK so they can be freed,
 * and when the number of saved TCP replies nears nfsrc_tcphighwater, a
 * histogram of the pending timeouts is built and entries are trimmed with a
 * shortened timeout until the count drops back below the high water mark.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
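	/* Currently a no-op; nothing is recorded for such connections. */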
}