1 /*-
2 * Copyright (c) 1989, 1991, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
33 */
34
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD: src/sys/nfsclient/nfs_socket.c,v 1.164 2008/11/03 10:38:00 dfr Exp $");
37
38 /*
39 * Socket operations for use by nfs
40 */
41
42 #include "opt_inet6.h"
43
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/kernel.h>
47 #include <sys/lock.h>
48 #include <sys/malloc.h>
49 #include <sys/mbuf.h>
50 #include <sys/mount.h>
51 #include <sys/mutex.h>
52 #include <sys/proc.h>
53 #include <sys/protosw.h>
54 #include <sys/signalvar.h>
55 #include <sys/syscallsubr.h>
56 #include <sys/socket.h>
57 #include <sys/socketvar.h>
58 #include <sys/sysctl.h>
59 #include <sys/syslog.h>
60 #include <sys/vnode.h>
61
62 #include <netinet/in.h>
63 #include <netinet/tcp.h>
64
65 #include <rpc/rpcclnt.h>
66
67 #include <nfs/rpcv2.h>
68 #include <nfs/nfsproto.h>
69 #include <nfsclient/nfs.h>
70 #include <nfs/xdr_subs.h>
71 #include <nfsclient/nfsm_subs.h>
72 #include <nfsclient/nfsmount.h>
73 #include <nfsclient/nfsnode.h>
74
75 #include <nfs4client/nfs4.h>
76
77 #ifdef NFS_LEGACYRPC
78
/* Local boolean constants. */
#define TRUE 1
#define FALSE 0

/*
 * File-scope counters and tunables.  All but fake_wchan are exported
 * below under the vfs.nfs sysctl tree.
 */
static int nfs_realign_test;		/* exported as vfs.nfs.realign_test */
static int nfs_realign_count;		/* exported as vfs.nfs.realign_count */
static int nfs_bufpackets = 4;		/* socket buffer scale; nfs_connect() clamps it to [2, 64] */
static int nfs_reconnects;		/* bumped once per nfs_reconnect() attempt */
static int nfs3_jukebox_delay = 10;	/* seconds, per the sysctl description */
static int nfs_skip_wcc_data_onerr = 1;
static int fake_wchan;			/* dummy wait channel for tsleep() in nfs_reconnect() */

SYSCTL_DECL(_vfs_nfs);

/*
 * Note: only "reconnects" is read-only (CTLFLAG_RD); the other knobs,
 * including the two realign counters, are writable (CTLFLAG_RW).
 */
SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0,
    "Number of realign tests done");
SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0,
    "Number of mbuf realignments done");
SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0,
    "Buffer reservation size 2 < x < 64");
SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
    "Number of times the nfs client has had to reconnect");
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
    "Number of seconds to delay a retry after receiving EJUKEBOX");
SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0,
    "Disable weak cache consistency checking when server returns an error");
104
/*
 * A congestion window for outstanding RPCs is maintained per mount
 * point.  The window (cwnd) is adjusted roughly the way TCP's is, as
 * described in:
 *	Van Jacobson, Congestion Avoidance and Control, in "Proceedings of
 *	SIGCOMM '88", ACM, August 1988.
 * The cwnd is halved on a retransmit timeout and incremented by 1/cwnd
 * each time an RPC reply is received while a full cwnd of RPCs is in
 * progress.
 * (The sent count and cwnd are scaled by NFS_CWNDSCALE so that integer
 * arithmetic can be used.)
 * Variants of "slow start" were tried and were found to be too much of a
 * performance hit (average rtt 3 times larger), probably because of the
 * large rtt that NFS RPCs have.
 */
#define NFS_CWNDSCALE 256			/* fixed-point scale: one RPC == 256 */
#define NFS_MAXCWND (NFS_CWNDSCALE * 32)	/* upper bound: 32 RPCs in flight */
#define NFS_NBACKOFF 8
/* Power-of-two multipliers; the consumer is outside this part of the file. */
static int nfs_backoff[NFS_NBACKOFF] = { 2, 4, 8, 16, 32, 64, 128, 256, };
/* Callout handle; not static, and armed/serviced outside this part of the file. */
struct callout nfs_callout;
123
/*
 * Forward declarations for file-local helpers.  Note that nfs_reply()
 * is declared static here, so its later definition has internal linkage
 * even though that definition omits the keyword.
 */
static int nfs_msg(struct thread *, const char *, const char *, int);
static int nfs_realign(struct mbuf **pm, int hsiz);
static int nfs_reply(struct nfsreq *);
static void nfs_softterm(struct nfsreq *rep);
static int nfs_reconnect(struct nfsreq *rep);
static void nfs_clnt_tcp_soupcall(struct socket *so, void *arg, int waitflag);
static void nfs_clnt_udp_soupcall(struct socket *so, void *arg, int waitflag);

/* Mutex taken around every walk of the global request queue (nfs_reqq). */
extern struct mtx nfs_reqq_mtx;
133
/*
 * RTT estimator
 */

/*
 * Maps an NFS procedure number (the array index) to the RTT timer class
 * used for that procedure.  Procedures mapped to NFS_DEFAULT_TIMER do
 * not get a per-class estimate; nfs_estimate_rto() falls back to the
 * mount's nm_timeo for them.
 */
static enum nfs_rto_timer_t nfs_proct[NFS_NPROCS] = {
	NFS_DEFAULT_TIMER,	/* NULL */
	NFS_GETATTR_TIMER,	/* GETATTR */
	NFS_DEFAULT_TIMER,	/* SETATTR */
	NFS_LOOKUP_TIMER,	/* LOOKUP */
	NFS_GETATTR_TIMER,	/* ACCESS */
	NFS_READ_TIMER,		/* READLINK */
	NFS_READ_TIMER,		/* READ */
	NFS_WRITE_TIMER,	/* WRITE */
	NFS_DEFAULT_TIMER,	/* CREATE */
	NFS_DEFAULT_TIMER,	/* MKDIR */
	NFS_DEFAULT_TIMER,	/* SYMLINK */
	NFS_DEFAULT_TIMER,	/* MKNOD */
	NFS_DEFAULT_TIMER,	/* REMOVE */
	NFS_DEFAULT_TIMER,	/* RMDIR */
	NFS_DEFAULT_TIMER,	/* RENAME */
	NFS_DEFAULT_TIMER,	/* LINK */
	NFS_READ_TIMER,		/* READDIR */
	NFS_READ_TIMER,		/* READDIRPLUS */
	NFS_DEFAULT_TIMER,	/* FSSTAT */
	NFS_DEFAULT_TIMER,	/* FSINFO */
	NFS_DEFAULT_TIMER,	/* PATHCONF */
	NFS_DEFAULT_TIMER,	/* COMMIT */
	NFS_DEFAULT_TIMER,	/* NOOP */
};
163
164 /*
165 * Choose the correct RTT timer for this NFS procedure.
166 */
167 static inline enum nfs_rto_timer_t
168 nfs_rto_timer(u_int32_t procnum)
169 {
170 return nfs_proct[procnum];
171 }
172
173 /*
174 * Initialize the RTT estimator state for a new mount point.
175 */
176 static void
177 nfs_init_rtt(struct nfsmount *nmp)
178 {
179 int i;
180
181 for (i = 0; i < NFS_MAX_TIMER; i++)
182 nmp->nm_srtt[i] = NFS_INITRTT;
183 for (i = 0; i < NFS_MAX_TIMER; i++)
184 nmp->nm_sdrtt[i] = 0;
185 }
186
187 /*
188 * Update a mount point's RTT estimator state using data from the
189 * passed-in request.
190 *
191 * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
192 *
193 * NB: Since the timer resolution of NFS_HZ is so course, it can often
194 * result in r_rtt == 0. Since r_rtt == N means that the actual RTT is
195 * between N + dt and N + 2 - dt ticks, add 1 before calculating the
196 * update values.
197 */
198 static void
199 nfs_update_rtt(struct nfsreq *rep)
200 {
201 int t1 = rep->r_rtt + 1;
202 int index = nfs_rto_timer(rep->r_procnum) - 1;
203 int *srtt = &rep->r_nmp->nm_srtt[index];
204 int *sdrtt = &rep->r_nmp->nm_sdrtt[index];
205
206 t1 -= *srtt >> 3;
207 *srtt += t1;
208 if (t1 < 0)
209 t1 = -t1;
210 t1 -= *sdrtt >> 2;
211 *sdrtt += t1;
212 }
213
214 /*
215 * Estimate RTO for an NFS RPC sent via an unreliable datagram.
216 *
217 * Use the mean and mean deviation of RTT for the appropriate type
218 * of RPC for the frequent RPCs and a default for the others.
219 * The justification for doing "other" this way is that these RPCs
220 * happen so infrequently that timer est. would probably be stale.
221 * Also, since many of these RPCs are non-idempotent, a conservative
222 * timeout is desired.
223 *
224 * getattr, lookup - A+2D
225 * read, write - A+4D
226 * other - nm_timeo
227 */
228 static int
229 nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
230 {
231 enum nfs_rto_timer_t timer = nfs_rto_timer(procnum);
232 int index = timer - 1;
233 int rto;
234
235 switch (timer) {
236 case NFS_GETATTR_TIMER:
237 case NFS_LOOKUP_TIMER:
238 rto = ((nmp->nm_srtt[index] + 3) >> 2) +
239 ((nmp->nm_sdrtt[index] + 1) >> 1);
240 break;
241 case NFS_READ_TIMER:
242 case NFS_WRITE_TIMER:
243 rto = ((nmp->nm_srtt[index] + 7) >> 3) +
244 (nmp->nm_sdrtt[index] + 1);
245 break;
246 default:
247 rto = nmp->nm_timeo;
248 return (rto);
249 }
250
251 if (rto < NFS_MINRTO)
252 rto = NFS_MINRTO;
253 else if (rto > NFS_MAXRTO)
254 rto = NFS_MAXRTO;
255
256 return (rto);
257 }
258
259
/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 *
 * Creates the mount's socket (nmp->nm_so), optionally binds it to a
 * reserved port, connects it unless NFSMNT_NOCONN is set, sizes the
 * socket buffers, installs the receive upcall and resets the RTT and
 * congestion-window state.
 *
 * rep may be NULL; when non-NULL it is used only to let a pending
 * signal (nfs_sigintr()) interrupt the wait for the connection.
 *
 * Returns 0 on success.  On any failure the partially set up socket is
 * torn down through nfs_disconnect() and the error is returned.
 */
int
nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
{
	struct socket *so;
	int error, rcvreserve, sndreserve;
	int pktscale;
	struct sockaddr *saddr;
	struct ucred *origcred;
	struct thread *td = curthread;

	/*
	 * We need to establish the socket using the credentials of
	 * the mountpoint.  Some parts of this process (such as
	 * sobind() and soconnect()) will use the current thread's
	 * credential instead of the socket credential.  To work
	 * around this, temporarily change the current thread's
	 * credential to that of the mountpoint.
	 *
	 * XXX: It would be better to explicitly pass the correct
	 * credential to sobind() and soconnect().
	 */
	origcred = td->td_ucred;
	td->td_ucred = nmp->nm_mountp->mnt_cred;

	/* A fresh stream starts at a record boundary: expect an RPC marker. */
	if (nmp->nm_sotype == SOCK_STREAM) {
		mtx_lock(&nmp->nm_mtx);
		nmp->nm_nfstcpstate.flags |= NFS_TCP_EXPECT_RPCMARKER;
		nmp->nm_nfstcpstate.rpcresid = 0;
		mtx_unlock(&nmp->nm_mtx);
	}
	nmp->nm_so = NULL;
	saddr = nmp->nm_nam;
	error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
	    nmp->nm_soproto, nmp->nm_mountp->mnt_cred, td);
	if (error)
		goto bad;
	so = nmp->nm_so;
	nmp->nm_soflags = so->so_proto->pr_flags;

	/*
	 * Some servers require that the client port be a reserved port number.
	 */
	if (nmp->nm_flag & NFSMNT_RESVPORT) {
		struct sockopt sopt;
		int ip, ip2, len;
		/* Sized for IPv6 so the same storage also fits an IPv4 address. */
		struct sockaddr_in6 ssin;
		struct sockaddr *sa;

		bzero(&sopt, sizeof sopt);
		switch(saddr->sa_family) {
		case AF_INET:
			sopt.sopt_level = IPPROTO_IP;
			sopt.sopt_name = IP_PORTRANGE;
			ip = IP_PORTRANGE_LOW;
			ip2 = IP_PORTRANGE_DEFAULT;
			len = sizeof (struct sockaddr_in);
			break;
#ifdef INET6
		case AF_INET6:
			sopt.sopt_level = IPPROTO_IPV6;
			sopt.sopt_name = IPV6_PORTRANGE;
			ip = IPV6_PORTRANGE_LOW;
			ip2 = IPV6_PORTRANGE_DEFAULT;
			len = sizeof (struct sockaddr_in6);
			break;
#endif
		default:
			/* Unknown address family: skip the reserved-port bind. */
			goto noresvport;
		}
		/* All-zero address and port: only length and family are filled in. */
		sa = (struct sockaddr *)&ssin;
		bzero(sa, len);
		sa->sa_len = len;
		sa->sa_family = saddr->sa_family;
		sopt.sopt_dir = SOPT_SET;
		sopt.sopt_val = (void *)&ip;
		sopt.sopt_valsize = sizeof(ip);
		/* Select the low port range, bind, then restore the default range. */
		error = sosetopt(so, &sopt);
		if (error)
			goto bad;
		error = sobind(so, sa, td);
		if (error)
			goto bad;
		ip = ip2;
		error = sosetopt(so, &sopt);
		if (error)
			goto bad;
noresvport: ;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than NFS_PORT.
	 */
	mtx_lock(&nmp->nm_mtx);
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		/* "noconn" is meaningless on a connection-oriented protocol. */
		if (nmp->nm_soflags & PR_CONNREQUIRED) {
			error = ENOTCONN;
			mtx_unlock(&nmp->nm_mtx);
			goto bad;
		} else
			mtx_unlock(&nmp->nm_mtx);
	} else {
		mtx_unlock(&nmp->nm_mtx);
		error = soconnect(so, nmp->nm_nam, td);
		if (error)
			goto bad;

		/*
		 * Wait for the connection to complete.  Cribbed from the
		 * connect system call but with the wait timing out so
		 * that interruptible mounts don't hang here for a long time.
		 */
		SOCK_LOCK(so);
		while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
			(void) msleep(&so->so_timeo, SOCK_MTX(so),
			    PSOCK, "nfscon", 2 * hz);
			/* Still connecting after the wait: honour a pending signal. */
			if ((so->so_state & SS_ISCONNECTING) &&
			    so->so_error == 0 && rep &&
			    (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				SOCK_UNLOCK(so);
				goto bad;
			}
		}
		if (so->so_error) {
			/* Consume the pending socket error and report it. */
			error = so->so_error;
			so->so_error = 0;
			SOCK_UNLOCK(so);
			goto bad;
		}
		SOCK_UNLOCK(so);
	}
	so->so_rcv.sb_timeo = 12 * hz;
	if (nmp->nm_sotype == SOCK_STREAM)
		so->so_snd.sb_timeo = 1 * hz;	/* 1s snd timeout for NFS/TCP */
	else
		so->so_snd.sb_timeo = 5 * hz;

	/*
	 * Get buffer reservation size from sysctl, but impose reasonable
	 * limits.
	 */
	pktscale = nfs_bufpackets;
	if (pktscale < 2)
		pktscale = 2;
	if (pktscale > 64)
		pktscale = 64;
	mtx_lock(&nmp->nm_mtx);
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * pktscale;
	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
		/* Same sizing as the datagram case. */
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * pktscale;
	} else {
		if (nmp->nm_sotype != SOCK_STREAM)
			panic("nfscon sotype");
		/*
		 * The two socket options below are best effort: the result
		 * of sosetopt() is ignored.  nm_mtx is dropped around each
		 * call and re-taken afterwards.
		 */
		if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
			struct sockopt sopt;
			int val;

			bzero(&sopt, sizeof sopt);
			sopt.sopt_dir = SOPT_SET;
			sopt.sopt_level = SOL_SOCKET;
			sopt.sopt_name = SO_KEEPALIVE;
			sopt.sopt_val = &val;
			sopt.sopt_valsize = sizeof val;
			val = 1;
			mtx_unlock(&nmp->nm_mtx);
			sosetopt(so, &sopt);
			mtx_lock(&nmp->nm_mtx);
		}
		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
			struct sockopt sopt;
			int val;

			bzero(&sopt, sizeof sopt);
			sopt.sopt_dir = SOPT_SET;
			sopt.sopt_level = IPPROTO_TCP;
			sopt.sopt_name = TCP_NODELAY;
			sopt.sopt_val = &val;
			sopt.sopt_valsize = sizeof val;
			val = 1;
			mtx_unlock(&nmp->nm_mtx);
			sosetopt(so, &sopt);
			mtx_lock(&nmp->nm_mtx);
		}
		/* Stream records carry an extra 32-bit word each. */
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * pktscale;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * pktscale;
	}
	mtx_unlock(&nmp->nm_mtx);
	error = soreserve(so, sndreserve, rcvreserve);
	if (error)
		goto bad;
	/* Install the receive upcall that matches the socket type. */
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_rcv.sb_flags |= SB_NOINTR;
	so->so_upcallarg = (caddr_t)nmp;
	if (so->so_type == SOCK_STREAM)
		so->so_upcall = nfs_clnt_tcp_soupcall;
	else
		so->so_upcall = nfs_clnt_udp_soupcall;
	so->so_rcv.sb_flags |= SB_UPCALL;
	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	SOCKBUF_UNLOCK(&so->so_snd);

	/* Restore current thread's credentials. */
	td->td_ucred = origcred;

	mtx_lock(&nmp->nm_mtx);
	/* Initialize other non-zero congestion variables */
	nfs_init_rtt(nmp);
	nmp->nm_cwnd = NFS_MAXCWND / 2;	/* Initial send window */
	nmp->nm_sent = 0;
	nmp->nm_timeouts = 0;
	mtx_unlock(&nmp->nm_mtx);
	return (0);

bad:
	/* Restore current thread's credentials. */
	td->td_ucred = origcred;

	/* Closes nmp->nm_so if socreate() got far enough to set it. */
	nfs_disconnect(nmp);
	return (error);
}
494
495 static void
496 nfs_wakup_reconnectors(struct nfsmount *nmp)
497 {
498 KASSERT(mtx_owned(&nmp->nm_mtx), ("NFS mnt lock not owned !"));
499 if (--nmp->nm_nfstcpstate.sock_send_inprog == 0 &&
500 (nmp->nm_nfstcpstate.flags & NFS_TCP_WAIT_WRITE_DRAIN)) {
501 nmp->nm_nfstcpstate.flags &= ~NFS_TCP_WAIT_WRITE_DRAIN;
502 wakeup((caddr_t)&nmp->nm_nfstcpstate.sock_send_inprog);
503 }
504 }
505
506 /*
507 * Reconnect routine:
508 * Called when a connection is broken on a reliable protocol.
509 * - clean up the old socket
510 * - nfs_connect() again
511 * - set R_MUSTRESEND for all outstanding requests on mount point
512 * If this fails the mount point is DEAD!
513 * nb: Must be called with the nfs_sndlock() set on the mount point.
514 */
515 static int
516 nfs_reconnect(struct nfsreq *rep)
517 {
518 struct nfsreq *rp;
519 struct nfsmount *nmp = rep->r_nmp;
520 int error;
521 int slpflag = 0;
522
523 KASSERT(mtx_owned(&nmp->nm_mtx), ("NFS mnt lock not owned !"));
524 if (nmp->nm_flag & NFSMNT_INT)
525 slpflag = PCATCH;
526 /*
527 * Wait for any pending writes to this socket to drain (or timeout).
528 */
529 while (nmp->nm_nfstcpstate.sock_send_inprog > 0) {
530 nmp->nm_nfstcpstate.flags |= NFS_TCP_WAIT_WRITE_DRAIN;
531 error = msleep((caddr_t)&nmp->nm_nfstcpstate.sock_send_inprog,
532 &nmp->nm_mtx, slpflag | (PZERO - 1), "nfscon", 0);
533 }
534 /*
535 * Grab the nfs_connect_lock to serialize connects.
536 * After grabbing the nfs_connect_lock, check if a reconnect is necessary or
537 * if someone else beat us to the connect !
538 */
539 error = nfs_connect_lock(rep);
540 if (error)
541 goto unlock_exit;
542 if ((nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) == 0)
543 goto unlock_exit;
544 else
545 mtx_unlock(&nmp->nm_mtx);
546
547 nfs_reconnects++;
548 nfs_disconnect(nmp);
549 while ((error = nfs_connect(nmp, rep)) != 0) {
550 if (error == ERESTART)
551 error = EINTR;
552 if (error == EIO || error == EINTR) {
553 mtx_lock(&nmp->nm_mtx);
554 goto unlock_exit;
555 }
556 (void) tsleep(&fake_wchan, PSOCK, "nfscon", hz);
557 }
558
559 /*
560 * Clear the FORCE_RECONNECT flag only after the connect
561 * succeeds. To prevent races between multiple processes
562 * waiting on the mountpoint where the connection is being
563 * torn down. The first one to acquire the sndlock will
564 * retry the connection. The others block on the sndlock
565 * until the connection is established successfully, and
566 * then re-transmit the request.
567 */
568 mtx_lock(&nmp->nm_mtx);
569 nmp->nm_nfstcpstate.flags &= ~NFS_TCP_FORCE_RECONNECT;
570 nmp->nm_nfstcpstate.rpcresid = 0;
571 mtx_unlock(&nmp->nm_mtx);
572
573 /*
574 * Loop through outstanding request list and fix up all requests
575 * on old socket.
576 */
577 mtx_lock(&nfs_reqq_mtx);
578 TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
579 if (rp->r_nmp == nmp) {
580 mtx_lock(&rp->r_mtx);
581 rp->r_flags |= R_MUSTRESEND;
582 mtx_unlock(&rp->r_mtx);
583 }
584 }
585 mtx_unlock(&nfs_reqq_mtx);
586 mtx_lock(&nmp->nm_mtx);
587 unlock_exit:
588 nfs_connect_unlock(rep);
589 mtx_unlock(&nmp->nm_mtx);
590 return (error);
591 }
592
593 /*
594 * NFS disconnect. Clean up and unlink.
595 */
596 void
597 nfs_disconnect(struct nfsmount *nmp)
598 {
599 struct socket *so;
600
601 mtx_lock(&nmp->nm_mtx);
602 if (nmp->nm_so) {
603 so = nmp->nm_so;
604 nmp->nm_so = NULL;
605 mtx_unlock(&nmp->nm_mtx);
606 SOCKBUF_LOCK(&so->so_rcv);
607 so->so_upcallarg = NULL;
608 so->so_upcall = NULL;
609 so->so_rcv.sb_flags &= ~SB_UPCALL;
610 SOCKBUF_UNLOCK(&so->so_rcv);
611 soshutdown(so, SHUT_WR);
612 soclose(so);
613 } else
614 mtx_unlock(&nmp->nm_mtx);
615 }
616
/*
 * Disconnect the mount's socket.  This is currently a plain wrapper
 * around nfs_disconnect(), which does its own locking; the dummy request
 * that used to be built here was never passed anywhere and has been
 * dropped.
 */
void
nfs_safedisconnect(struct nfsmount *nmp)
{
	nfs_disconnect(nmp);
}
626
627 /*
628 * This is the nfs send routine. For connection based socket types, it
629 * must be called with an nfs_sndlock() on the socket.
630 * - return EINTR if the RPC is terminated, 0 otherwise
631 * - set R_MUSTRESEND if the send fails for any reason
632 * - do any cleanup required by recoverable socket errors (?)
633 */
634 int
635 nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
636 struct nfsreq *rep)
637 {
638 struct sockaddr *sendnam;
639 int error, error2, soflags, flags;
640
641 KASSERT(rep, ("nfs_send: called with rep == NULL"));
642
643 error = nfs_sigintr(rep->r_nmp, rep, rep->r_td);
644 if (error) {
645 m_freem(top);
646 return (error);
647 }
648 mtx_lock(&rep->r_nmp->nm_mtx);
649 mtx_lock(&rep->r_mtx);
650 if ((so = rep->r_nmp->nm_so) == NULL) {
651 rep->r_flags |= R_MUSTRESEND;
652 mtx_unlock(&rep->r_mtx);
653 mtx_unlock(&rep->r_nmp->nm_mtx);
654 m_freem(top);
655 return (EPIPE);
656 }
657 rep->r_flags &= ~R_MUSTRESEND;
658 soflags = rep->r_nmp->nm_soflags;
659 mtx_unlock(&rep->r_mtx);
660 mtx_unlock(&rep->r_nmp->nm_mtx);
661
662 if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
663 sendnam = NULL;
664 else
665 sendnam = nam;
666 if (so->so_type == SOCK_SEQPACKET)
667 flags = MSG_EOR;
668 else
669 flags = 0;
670
671 error = sosend(so, sendnam, 0, top, 0, flags, curthread /*XXX*/);
672 if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
673 error = 0;
674 mtx_lock(&rep->r_mtx);
675 rep->r_flags |= R_MUSTRESEND;
676 mtx_unlock(&rep->r_mtx);
677 }
678
679 if (error) {
680 /*
681 * Don't report EPIPE errors on nfs sockets.
682 * These can be due to idle tcp mounts which will be closed by
683 * netapp, solaris, etc. if left idle too long.
684 */
685 if (error != EPIPE) {
686 log(LOG_INFO, "nfs send error %d for server %s\n",
687 error,
688 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
689 }
690 /*
691 * Deal with errors for the client side.
692 */
693 error2 = NFS_SIGREP(rep);
694 if (error2)
695 error = error2;
696 else {
697 mtx_lock(&rep->r_mtx);
698 rep->r_flags |= R_MUSTRESEND;
699 mtx_unlock(&rep->r_mtx);
700 }
701
702 /*
703 * Handle any recoverable (soft) socket errors here. (?)
704 * Make EWOULDBLOCK a recoverable error, we'll rexmit from nfs_timer().
705 */
706 if (error != EINTR && error != ERESTART && error != EIO && error != EPIPE)
707 error = 0;
708 }
709 return (error);
710 }
711
/*
 * Wait for the reply to request "rep".
 *
 * For non-datagram sockets this first makes sure a usable connection
 * exists (reconnecting if necessary) and retransmits the request while
 * R_MUSTRESEND is set.  It then sleeps on the request until a reply is
 * attached (r_mrep), the request is soft-terminated, the sleep is
 * interrupted or, for non-datagram sockets, a resend is requested.
 *
 * Returns 0 once r_mrep is set, EINTR if interrupted (ERESTART is
 * mapped to EINTR), ETIMEDOUT if R_SOFTTERM is found set after the
 * wait, or an error from nfs_send()/nfs_reconnect().
 */
int
nfs_reply(struct nfsreq *rep)
{
	register struct socket *so;
	register struct mbuf *m;
	int error = 0, sotype, slpflag;
	struct nfsmount *nmp = rep->r_nmp;

	sotype = nmp->nm_sotype;
	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 */
	if (sotype != SOCK_DGRAM) {
		/* Also the target of a goto from the end of the function. */
tryagain:
		mtx_lock(&nmp->nm_mtx);
		mtx_lock(&rep->r_mtx);
		if (rep->r_mrep) {
			/* The reply has already arrived. */
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
			return (0);
		}
		if (rep->r_flags & R_SOFTTERM) {
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
			return (EINTR);
		}
		so = nmp->nm_so;
		if (!so ||
		    (nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) {
			mtx_unlock(&rep->r_mtx);
			nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
			/* Entered with nm_mtx held; nfs_reconnect() releases it. */
			error = nfs_reconnect(rep);
			if (error)
				return (error);
			goto tryagain;
		}
		while (rep->r_flags & R_MUSTRESEND) {
			mtx_unlock(&rep->r_mtx);
			/* Count this send so a reconnector waits for it to finish. */
			nmp->nm_nfstcpstate.sock_send_inprog++;
			mtx_unlock(&nmp->nm_mtx);
			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
			nfsstats.rpcretries++;
			error = nfs_send(so, nmp->nm_nam, m, rep);
			if (error) {
				mtx_lock(&nmp->nm_mtx);
				nfs_wakup_reconnectors(nmp);
				if (!(error == EINTR || error == ERESTART)) {
					/* Hard send failure: force a reconnect. */
					nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
					error = nfs_reconnect(rep);
				} else
					mtx_unlock(&nmp->nm_mtx);
				if (error)
					return (error);
				goto tryagain;
			} else {
				mtx_lock(&nmp->nm_mtx);
				nfs_wakup_reconnectors(nmp);
				/* Re-take r_mtx for the loop condition. */
				mtx_lock(&rep->r_mtx);
			}
		}
		mtx_unlock(&rep->r_mtx);
		mtx_unlock(&nmp->nm_mtx);
	}
	slpflag = 0;
	mtx_lock(&nmp->nm_mtx);
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	mtx_unlock(&nmp->nm_mtx);
	mtx_lock(&rep->r_mtx);
	/*
	 * Sleep until a reply is attached, the sleep fails, the request is
	 * soft-terminated or (non-datagram only) a resend is requested.
	 */
	while ((rep->r_mrep == NULL) && (error == 0) &&
	    ((rep->r_flags & R_SOFTTERM) == 0) &&
	    ((sotype == SOCK_DGRAM) || ((rep->r_flags & R_MUSTRESEND) == 0)))
		error = msleep((caddr_t)rep, &rep->r_mtx,
		    slpflag | (PZERO - 1), "nfsreq", 0);
	if (error == EINTR || error == ERESTART) {
		/* NFS operations aren't restartable. Map ERESTART to EINTR */
		mtx_unlock(&rep->r_mtx);
		return (EINTR);
	}
	if (rep->r_flags & R_SOFTTERM) {
		/* Request was terminated because we exceeded the retries (soft mount) */
		mtx_unlock(&rep->r_mtx);
		return (ETIMEDOUT);
	}
	mtx_unlock(&rep->r_mtx);
	if (sotype == SOCK_STREAM) {
		mtx_lock(&nmp->nm_mtx);
		mtx_lock(&rep->r_mtx);
		/* Woken for a reconnect or resend rather than a reply: start over. */
		if (((nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) ||
		    (rep->r_flags & R_MUSTRESEND))) {
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
			goto tryagain;
		} else {
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
		}
	}
	return (error);
}
813
/*
 * Match a received RPC reply (mrep) to the outstanding request with the
 * same xid.  On a match the reply is attached to the request, the
 * mount's congestion window and RTT estimate are updated, and the
 * thread sleeping on the request is woken.  A reply that matches no
 * request is counted as unexpected and freed.
 *
 * The "so" argument is not used in this function.
 *
 * XXX TO DO
 * Make nfs_realign() non-blocking. Also make nfsm_dissect() nonblocking.
 */
static void
nfs_clnt_match_xid(struct socket *so,
    struct nfsmount *nmp,
    struct mbuf *mrep)
{
	struct mbuf *md;
	caddr_t dpos;
	u_int32_t rxid, *tl;
	struct nfsreq *rep;
	int error;

	/*
	 * Search for any mbufs that are not a multiple of 4 bytes long
	 * or with m_data not longword aligned.
	 * These could cause pointer alignment problems, so copy them to
	 * well aligned mbufs.
	 */
	if (nfs_realign(&mrep, 5 * NFSX_UNSIGNED) == ENOMEM) {
		m_freem(mrep);
		nfsstats.rpcinvalid++;
		return;
	}

	/*
	 * Get the xid and check that it is an rpc reply
	 */
	md = mrep;
	dpos = mtod(md, caddr_t);
	/*
	 * NOTE(review): nfsm_dissect_nonblock() is a macro defined outside
	 * this file.  It presumably advances md/dpos, sets "error" and jumps
	 * to the nfsmout label below on failure -- confirm, and confirm
	 * whether mrep is freed on that path, since the label sits after
	 * the m_freem() call.
	 */
	tl = nfsm_dissect_nonblock(u_int32_t *, 2*NFSX_UNSIGNED);
	rxid = *tl++;
	if (*tl != rpc_reply) {
		m_freem(mrep);
nfsmout:
		nfsstats.rpcinvalid++;
		return;
	}

	mtx_lock(&nfs_reqq_mtx);
	/*
	 * Loop through the request list to match up the reply
	 * Iff no match, just drop the datagram
	 */
	TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
		/* Lock order: mount mutex first, then the request mutex. */
		mtx_lock(&nmp->nm_mtx);
		mtx_lock(&rep->r_mtx);
		if (rep->r_mrep == NULL && rxid == rep->r_xid) {
			/* Found it.. */
			rep->r_mrep = mrep;
			rep->r_md = md;
			rep->r_dpos = dpos;
			/*
			 * Update congestion window.
			 * Do the additive increase of
			 * one rpc/rtt.
			 */
			if (nmp->nm_cwnd <= nmp->nm_sent) {
				/* Rounded NFS_CWNDSCALE^2 / cwnd, capped at NFS_MAXCWND. */
				nmp->nm_cwnd +=
				    (NFS_CWNDSCALE * NFS_CWNDSCALE +
				    (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
				if (nmp->nm_cwnd > NFS_MAXCWND)
					nmp->nm_cwnd = NFS_MAXCWND;
			}
			if (rep->r_flags & R_SENT) {
				/* This request no longer counts against the window. */
				rep->r_flags &= ~R_SENT;
				nmp->nm_sent -= NFS_CWNDSCALE;
			}
			if (rep->r_flags & R_TIMING)
				nfs_update_rtt(rep);
			nmp->nm_timeouts = 0;
			wakeup((caddr_t)rep);
			mtx_unlock(&rep->r_mtx);
			mtx_unlock(&nmp->nm_mtx);
			break;
		}
		mtx_unlock(&rep->r_mtx);
		mtx_unlock(&nmp->nm_mtx);
	}
	/*
	 * If not matched to a request, drop it.
	 * If it's mine, wake up requestor.
	 */
	/* rep is NULL here only when the loop ran to the end without a match. */
	if (rep == 0) {
		nfsstats.rpcunexpected++;
		m_freem(mrep);
	}
	mtx_unlock(&nfs_reqq_mtx);
}
905
906 static void
907 nfs_mark_for_reconnect(struct nfsmount *nmp)
908 {
909 struct nfsreq *rp;
910
911 mtx_lock(&nmp->nm_mtx);
912 nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
913 mtx_unlock(&nmp->nm_mtx);
914 /*
915 * Wakeup all processes that are waiting for replies
916 * on this mount point. One of them does the reconnect.
917 */
918 mtx_lock(&nfs_reqq_mtx);
919 TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
920 if (rp->r_nmp == nmp) {
921 mtx_lock(&rp->r_mtx);
922 rp->r_flags |= R_MUSTRESEND;
923 wakeup((caddr_t)rp);
924 mtx_unlock(&rp->r_mtx);
925 }
926 }
927 mtx_unlock(&nfs_reqq_mtx);
928 }
929
930 static int
931 nfstcp_readable(struct socket *so, int bytes)
932 {
933 int retval;
934
935 SOCKBUF_LOCK(&so->so_rcv);
936 retval = (so->so_rcv.sb_cc >= (bytes) ||
937 (so->so_rcv.sb_state & SBS_CANTRCVMORE) ||
938 so->so_error);
939 SOCKBUF_UNLOCK(&so->so_rcv);
940 return (retval);
|