/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all costs
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 *   When a request arrives:
 *   For all that match key
 *   - if RPC# != OR request_size !=
 *     - not a match with this one
 *   - if NFSv4 and received on same TCP socket OR
 *     received on a TCP connection created before the
 *     entry was cached
 *     - not a match with this one
 *     (V2,3 clients might retry on same TCP socket)
 *   - calculate checksum on first N bytes of NFS XDR
 *   - if checksum !=
 *     - not a match for this one
 *   If any of the remaining ones that match has a
 *     seqid_refcnt > 0
 *     - not a match (go do RPC, using new cache entry)
 *   If one match left
 *     - a hit (reply from cache)
 *   else
 *     - miss (go do RPC, using new cache entry)
 *
 * During processing of NFSv4 request:
 * - set a flag when a non-idempotent Op is processed
 * - when an Op that uses a seqid# (Open,...) is processed
 *   - if same seqid# as referenced entry in cache
 *     - free new cache entry
 *     - reply from referenced cache entry
 *   else if next seqid# in order
 *     - free referenced cache entry
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *   else if first seqid# in sequence
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *
 * At end of RPC processing:
 * - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *   cache entry
 *   - save reply in cache entry
 *   - calculate checksum on first N bytes of NFS XDR
 *     request
 *   - note op and length of XDR request (in bytes)
 *   - timestamp it
 * else
 *   - free new cache entry
 * - Send reply (noting info for socket activity check, below)
 *
 * For cache entries saved above:
 * - if saved since seqid_refcnt was > 0
 *   - free when seqid_refcnt decrements to 0
 *     (when next one in sequence is processed above, or
 *     when Openowner/Lockowner is discarded)
 * else { non-idempotent Op(s) }
 *   - free when
 *     - some further activity observed on same
 *       socket
 *       (I'm not yet sure how I'm going to do
 *       this. Maybe look at the TCP connection
 *       to see if the send_tcp_sequence# is well
 *       past sent reply OR K additional RPCs
 *       replied on same socket OR?)
 *     OR
 *     - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *   - if RPC marked In_progress
 *     - discard request (don't send reply)
 *   else
 *     - reply from cache
 *     - timestamp cache entry
 * else
 *   - add entry to cache, marked In_progress
 *   - do RPC
 *   - when RPC done
 *     - if RPC# non-idempotent
 *       - mark entry Done (not In_progress)
 *       - save reply
 *       - timestamp cache entry
 *     else
 *       - free cache entry
 *   - send reply
 *
 * Later, entries with saved replies are free'd a short time (a few minutes)
 * after the reply is sent (timestamp).
 * Reference for the UDP case: Chet Juszczak, "Improving the Performance and
 * Correctness of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 * pages 53-63, San Diego, February 1989.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 * for TCP. For V3, a reply won't be saved when the flood level is
 * hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 * that case. This level should be set high enough that this almost
 * never happens.
 */
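
/*
 * The block below is an illustrative, user-space sketch of the TCP lookup
 * rules described above, kept out of the kernel build with #if 0.  The
 * "drc_entry" structure, its field names and drc_tcp_match() are
 * hypothetical simplifications of the real struct nfsrvcache handled later
 * in this file, not part of the actual implementation.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

struct drc_entry {
	uint32_t	xid;		/* RPC transaction id */
	uint32_t	procnum;	/* RPC procedure number */
	uint32_t	reqlen;		/* length of the XDR request */
	uint16_t	cksum;		/* checksum of first N request bytes */
	uint64_t	sockref;	/* identifies the TCP connection */
	time_t		conntime;	/* request: when its connection was
					 * created; entry: when it was cached */
	bool		nfsv4;		/* true for NFSv4 */
	int		seqid_refcnt;	/* > 0 while a seqid# Op references it */
};

/*
 * Decide whether a newly arrived request (req) may be answered from a
 * cached entry (ent).  A real hit additionally requires that exactly one
 * cached entry matches.
 */
static bool
drc_tcp_match(const struct drc_entry *req, const struct drc_entry *ent)
{

	if (req->xid != ent->xid || req->procnum != ent->procnum ||
	    req->reqlen != ent->reqlen || req->cksum != ent->cksum)
		return (false);
	/*
	 * For NFSv4, a retry must arrive on a different TCP connection that
	 * was not created before the entry was cached; V2,3 clients might
	 * retry on the same TCP socket.
	 */
	if (req->nfsv4 && (req->sockref == ent->sockref ||
	    req->conntime < ent->conntime))
		return (false);
	/* Entries still referenced by a seqid# Op are never replayed. */
	if (ent->seqid_refcnt > 0)
		return (false);
	return (true);
}
#endif
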
#include <fs/nfs/nfsport.h>

extern struct nfsstatsv1 nfsstatsv1;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;

SYSCTL_DECL(_vfs_nfsd);

static u_int nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");
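
/*
 * Example (derived from the handler above): setting the high water mark,
 * e.g. "sysctl vfs.nfsd.tcphighwater=100000", also raises nfsrc_floodlevel
 * to 120000 (the new value plus 20%) whenever the new value is at or above
 * the current flood level, so the flood level always stays ahead of the
 * high water mark.
 */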

static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The reverse mapping from generic to NFS Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid)	(&nfsrcahash_table[nfsrc_hash(xid)])
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100
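
/*
 * Example of the bucket hash: for xid 0x12345678, (xid >> 24) is 0x12, so
 * the value taken modulo NFSRVCACHE_HASHSIZE is 0x1234568a; folding the
 * high byte in this way mixes the upper bits of the xid into the bucket
 * selection.
 */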

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Return the address family of a cache entry (both IPv4 and IPv6 are
 * handled).
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	nfsstatsv1.srvcache_tcppeak = 0;
	nfsstatsv1.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	newrp = malloc(sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
		if (newrp->rc_xid == rp->rc_xid &&
		    newrp->rc_proc == rp->rc_proc &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				nfsstatsv1.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free(newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	struct mbuf *m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		nfsstatsv1.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			m_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				atomic_add_int(&nfsrc_tcpsavedreplies, 1);
				if (nfsrc_tcpsavedreplies >
				    nfsstatsv1.srvcache_tcppeak)
					nfsstatsv1.srvcache_tcppeak =
					    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
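
	/*
	 * When a TCP sequence number is supplied, record it on the ack hash
	 * list so that nfsrc_trimcache() can free the entry once the client
	 * has acknowledged the reply data up to that sequence number.
	 */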
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			nfsstatsv1.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free(newrp, M_NFSRVCACHE);
		goto out;
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		m_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	free(rp, M_NFSRVCACHE);
	atomic_add_int(&nfsstatsv1.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	nfsstatsv1.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define	HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

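	/*
	 * First pass: for the given socket, mark entries whose reply has
	 * been acknowledged (snd_una has passed the recorded TCP sequence
	 * number) so they can be freed in the scans below; on a final call
	 * for the socket, mark the rest as never to be acked.
	 */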
	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

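	/* Only one thread scans the cache at a time; others simply return. */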
	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			    && rp->rc_refcnt == 0
			    && ((rp->rc_flag & RC_REFCNT) ||
				udp_lasttrim > rp->rc_timestamp ||
				nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
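					/*
					 * For example, with tto == 300 an
					 * entry that expires 75 seconds from
					 * now lands in bucket
					 * 75 * 16 / 300 == 4.
					 */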
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	struct mbuf *m;

	m = m1;
	while (m) {
		len += m->m_len;
		m = m->m_next;
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
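	/*
	 * Note that in_cksum() walks the mbuf chain, so the checksum covers
	 * the first cklen bytes of the request even when they span more
	 * than one mbuf.
	 */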
	*cksum = in_cksum(m1, cklen);
	return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
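	/* Currently a no-op. */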
}