[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ]

FreeBSD/Linux Kernel Cross Reference
sys/nfsclient/nfs_socket.c

Version: -  FREEBSD  -  FREEBSD7  -  FREEBSD70  -  FREEBSD6  -  FREEBSD63  -  FREEBSD62  -  FREEBSD61  -  FREEBSD60  -  FREEBSD5  -  FREEBSD55  -  FREEBSD54  -  FREEBSD53  -  FREEBSD52  -  FREEBSD51  -  FREEBSD50  -  FREEBSD4  -  FREEBSD3  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  OPENSOLARIS  -  minix-3-1-1  -  TRUSTEDBSD-SEBSD  -  FREEBSD-LIBC  -  FREEBSD7-LIBC  -  FREEBSD6-LIBC  -  GLIBC27 
SearchContext: -  none  -  excerpts  -  bigexcerpts 

  1 /*-
  2  * Copyright (c) 1989, 1991, 1993, 1995
  3  *      The Regents of the University of California.  All rights reserved.
  4  *
  5  * This code is derived from software contributed to Berkeley by
  6  * Rick Macklem at The University of Guelph.
  7  *
  8  * Redistribution and use in source and binary forms, with or without
  9  * modification, are permitted provided that the following conditions
 10  * are met:
 11  * 1. Redistributions of source code must retain the above copyright
 12  *    notice, this list of conditions and the following disclaimer.
 13  * 2. Redistributions in binary form must reproduce the above copyright
 14  *    notice, this list of conditions and the following disclaimer in the
 15  *    documentation and/or other materials provided with the distribution.
 16  * 4. Neither the name of the University nor the names of its contributors
 17  *    may be used to endorse or promote products derived from this software
 18  *    without specific prior written permission.
 19  *
 20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 30  * SUCH DAMAGE.
 31  *
 32  *      @(#)nfs_socket.c        8.5 (Berkeley) 3/30/95
 33  */
 34 
 35 #include <sys/cdefs.h>
 36 __FBSDID("$FreeBSD: src/sys/nfsclient/nfs_socket.c,v 1.164 2008/11/03 10:38:00 dfr Exp $");
 37 
 38 /*
 39  * Socket operations for use by nfs
 40  */
 41 
 42 #include "opt_inet6.h"
 43 
 44 #include <sys/param.h>
 45 #include <sys/systm.h>
 46 #include <sys/kernel.h>
 47 #include <sys/lock.h>
 48 #include <sys/malloc.h>
 49 #include <sys/mbuf.h>
 50 #include <sys/mount.h>
 51 #include <sys/mutex.h>
 52 #include <sys/proc.h>
 53 #include <sys/protosw.h>
 54 #include <sys/signalvar.h>
 55 #include <sys/syscallsubr.h>
 56 #include <sys/socket.h>
 57 #include <sys/socketvar.h>
 58 #include <sys/sysctl.h>
 59 #include <sys/syslog.h>
 60 #include <sys/vnode.h>
 61 
 62 #include <netinet/in.h>
 63 #include <netinet/tcp.h>
 64 
 65 #include <rpc/rpcclnt.h>
 66 
 67 #include <nfs/rpcv2.h>
 68 #include <nfs/nfsproto.h>
 69 #include <nfsclient/nfs.h>
 70 #include <nfs/xdr_subs.h>
 71 #include <nfsclient/nfsm_subs.h>
 72 #include <nfsclient/nfsmount.h>
 73 #include <nfsclient/nfsnode.h>
 74 
 75 #include <nfs4client/nfs4.h>
 76 
 77 #ifdef NFS_LEGACYRPC
 78 
 79 #define TRUE    1
 80 #define FALSE   0
 81 
 82 static int      nfs_realign_test;
 83 static int      nfs_realign_count;
 84 static int      nfs_bufpackets = 4;
 85 static int      nfs_reconnects;
 86 static int      nfs3_jukebox_delay = 10;
 87 static int      nfs_skip_wcc_data_onerr = 1;
 88 static int      fake_wchan;
 89 
 90 SYSCTL_DECL(_vfs_nfs);
 91 
 92 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0,
 93     "Number of realign tests done");
 94 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0,
 95     "Number of mbuf realignments done");
 96 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0,
 97     "Buffer reservation size 2 < x < 64");
 98 SYSCTL_INT(_vfs_nfs, OID_AUTO, reconnects, CTLFLAG_RD, &nfs_reconnects, 0,
 99     "Number of times the nfs client has had to reconnect");
100 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs3_jukebox_delay, CTLFLAG_RW, &nfs3_jukebox_delay, 0,
101     "Number of seconds to delay a retry after receiving EJUKEBOX");
102 SYSCTL_INT(_vfs_nfs, OID_AUTO, skip_wcc_data_onerr, CTLFLAG_RW, &nfs_skip_wcc_data_onerr, 0,
103     "Disable weak cache consistency checking when server returns an error");
104 
105 /*
106  * There is a congestion window for outstanding rpcs maintained per mount
107  * point. The cwnd size is adjusted in roughly the way that:
108  * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
109  * SIGCOMM '88". ACM, August 1988.
110  * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
111  * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
112  * of rpcs is in progress.
113  * (The sent count and cwnd are scaled for integer arith.)
114  * Variants of "slow start" were tried and were found to be too much of a
115  * performance hit (ave. rtt 3 times larger),
116  * I suspect due to the large rtt that nfs rpcs have.
117  */
    /*
     * Congestion-window constants.  NFS_CWNDSCALE is the fixed-point scale
     * for the window; NFS_MAXCWND is 32 scaled units.  nfs_connect() starts
     * each connection at NFS_MAXCWND / 2.
     */
118 #define NFS_CWNDSCALE   256
119 #define NFS_MAXCWND     (NFS_CWNDSCALE * 32)
120 #define NFS_NBACKOFF    8       /* number of entries in nfs_backoff[] */
    /* Not referenced in this part of the file; presumably retransmit backoff multipliers -- TODO confirm. */
121 static int nfs_backoff[NFS_NBACKOFF] = { 2, 4, 8, 16, 32, 64, 128, 256, };
    /* Not referenced in this part of the file; presumably drives the NFS timer -- TODO confirm. */
122 struct callout  nfs_callout;
123 
    /* Forward declarations for file-local routines. */
124 static int      nfs_msg(struct thread *, const char *, const char *, int);
125 static int      nfs_realign(struct mbuf **pm, int hsiz);
126 static int      nfs_reply(struct nfsreq *);
127 static void     nfs_softterm(struct nfsreq *rep);
128 static int      nfs_reconnect(struct nfsreq *rep);
    /* Receive upcalls installed on the socket by nfs_connect(). */
129 static void nfs_clnt_tcp_soupcall(struct socket *so, void *arg, int waitflag);
130 static void nfs_clnt_udp_soupcall(struct socket *so, void *arg, int waitflag);
131 
    /* Held while nfs_reconnect() walks the nfs_reqq request list. */
132 extern struct mtx nfs_reqq_mtx;
133 
134 /*
135  * RTT estimator
136  */
137 
    /*
     * Timer class for each NFS procedure.  The table is indexed by
     * procedure number (see nfs_rto_timer()); the comment on each entry
     * names the procedure occupying that slot.
     */
138 static enum nfs_rto_timer_t nfs_proct[NFS_NPROCS] = {
139         NFS_DEFAULT_TIMER,      /* NULL */
140         NFS_GETATTR_TIMER,      /* GETATTR */
141         NFS_DEFAULT_TIMER,      /* SETATTR */
142         NFS_LOOKUP_TIMER,       /* LOOKUP */
143         NFS_GETATTR_TIMER,      /* ACCESS */
144         NFS_READ_TIMER,         /* READLINK */
145         NFS_READ_TIMER,         /* READ */
146         NFS_WRITE_TIMER,        /* WRITE */
147         NFS_DEFAULT_TIMER,      /* CREATE */
148         NFS_DEFAULT_TIMER,      /* MKDIR */
149         NFS_DEFAULT_TIMER,      /* SYMLINK */
150         NFS_DEFAULT_TIMER,      /* MKNOD */
151         NFS_DEFAULT_TIMER,      /* REMOVE */
152         NFS_DEFAULT_TIMER,      /* RMDIR */
153         NFS_DEFAULT_TIMER,      /* RENAME */
154         NFS_DEFAULT_TIMER,      /* LINK */
155         NFS_READ_TIMER,         /* READDIR */
156         NFS_READ_TIMER,         /* READDIRPLUS */
157         NFS_DEFAULT_TIMER,      /* FSSTAT */
158         NFS_DEFAULT_TIMER,      /* FSINFO */
159         NFS_DEFAULT_TIMER,      /* PATHCONF */
160         NFS_DEFAULT_TIMER,      /* COMMIT */
161         NFS_DEFAULT_TIMER,      /* NOOP */
162 };
163 
164 /*
165  * Choose the correct RTT timer for this NFS procedure.
166  */
167 static inline enum nfs_rto_timer_t
168 nfs_rto_timer(u_int32_t procnum)
169 {
170         return nfs_proct[procnum];
171 }
172 
173 /*
174  * Initialize the RTT estimator state for a new mount point.
175  */
176 static void
177 nfs_init_rtt(struct nfsmount *nmp)
178 {
179         int i;
180 
181         for (i = 0; i < NFS_MAX_TIMER; i++)
182                 nmp->nm_srtt[i] = NFS_INITRTT;
183         for (i = 0; i < NFS_MAX_TIMER; i++)
184                 nmp->nm_sdrtt[i] = 0;
185 }
186 
187 /*
188  * Update a mount point's RTT estimator state using data from the
189  * passed-in request.
190  * 
191  * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
192  *
193  * NB: Since the timer resolution of NFS_HZ is so course, it can often
194  * result in r_rtt == 0. Since r_rtt == N means that the actual RTT is
195  * between N + dt and N + 2 - dt ticks, add 1 before calculating the
196  * update values.
197  */
198 static void
199 nfs_update_rtt(struct nfsreq *rep)
200 {
201         int t1 = rep->r_rtt + 1;
202         int index = nfs_rto_timer(rep->r_procnum) - 1;
203         int *srtt = &rep->r_nmp->nm_srtt[index];
204         int *sdrtt = &rep->r_nmp->nm_sdrtt[index];
205 
206         t1 -= *srtt >> 3;
207         *srtt += t1;
208         if (t1 < 0)
209                 t1 = -t1;
210         t1 -= *sdrtt >> 2;
211         *sdrtt += t1;
212 }
213 
214 /*
215  * Estimate RTO for an NFS RPC sent via an unreliable datagram.
216  *
217  * Use the mean and mean deviation of RTT for the appropriate type
218  * of RPC for the frequent RPCs and a default for the others.
219  * The justification for doing "other" this way is that these RPCs
220  * happen so infrequently that timer est. would probably be stale.
221  * Also, since many of these RPCs are non-idempotent, a conservative
222  * timeout is desired.
223  *
224  * getattr, lookup - A+2D
225  * read, write     - A+4D
226  * other           - nm_timeo
227  */
228 static int
229 nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
230 {
231         enum nfs_rto_timer_t timer = nfs_rto_timer(procnum);
232         int index = timer - 1;
233         int rto;
234 
235         switch (timer) {
236         case NFS_GETATTR_TIMER:
237         case NFS_LOOKUP_TIMER:
238                 rto = ((nmp->nm_srtt[index] + 3) >> 2) +
239                                 ((nmp->nm_sdrtt[index] + 1) >> 1);
240                 break;
241         case NFS_READ_TIMER:
242         case NFS_WRITE_TIMER:
243                 rto = ((nmp->nm_srtt[index] + 7) >> 3) +
244                                 (nmp->nm_sdrtt[index] + 1);
245                 break;
246         default:
247                 rto = nmp->nm_timeo;
248                 return (rto);
249         }
250 
251         if (rto < NFS_MINRTO)
252                 rto = NFS_MINRTO;
253         else if (rto > NFS_MAXRTO)
254                 rto = NFS_MAXRTO;
255 
256         return (rto);
257 }
258 
259 
260 /*
261  * Initialize sockets and congestion for a new NFS connection.
262  * We do not free the sockaddr if error.
     *
     * Steps, in order:
     *  - create the socket described by nm_nam/nm_sotype/nm_soproto and
     *    store it in nmp->nm_so;
     *  - if NFSMNT_RESVPORT is set, bind it while the low port range
     *    option is in effect;
     *  - unless NFSMNT_NOCONN is set, connect to nm_nam and wait for the
     *    connection to complete;
     *  - set socket buffer timeouts and reservations and install the
     *    receive upcall;
     *  - reset the RTT estimator and the congestion window.
     *
     * nmp: mount point to connect.
     * rep: request on whose behalf we connect; may be NULL, in which case
     *      the connect wait is not checked for signals.
     *
     * Returns 0 on success.  On failure the error is returned after
     * nfs_disconnect() has been called on the mount point.  The calling
     * thread's credential is replaced by the mount's during socket setup
     * and restored on both the success and the failure path.
263  */
264 int
265 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
266 {
267         struct socket *so;
268         int error, rcvreserve, sndreserve;
269         int pktscale;
270         struct sockaddr *saddr;
271         struct ucred *origcred;
272         struct thread *td = curthread;
273 
274         /*
275          * We need to establish the socket using the credentials of
276          * the mountpoint.  Some parts of this process (such as
277          * sobind() and soconnect()) will use the current thread's
278          * credential instead of the socket credential.  To work
279          * around this, temporarily change the current thread's
280          * credential to that of the mountpoint.
281          *
282          * XXX: It would be better to explicitly pass the correct
283          * credential to sobind() and soconnect().
284          */
285         origcred = td->td_ucred;
286         td->td_ucred = nmp->nm_mountp->mnt_cred;
287 
            /* For stream sockets, reset the record-marker state for the new connection. */
288         if (nmp->nm_sotype == SOCK_STREAM) {
289                 mtx_lock(&nmp->nm_mtx);
290                 nmp->nm_nfstcpstate.flags |= NFS_TCP_EXPECT_RPCMARKER;
291                 nmp->nm_nfstcpstate.rpcresid = 0;
292                 mtx_unlock(&nmp->nm_mtx);
293         }       
294         nmp->nm_so = NULL;
295         saddr = nmp->nm_nam;
296         error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
297                 nmp->nm_soproto, nmp->nm_mountp->mnt_cred, td);
298         if (error)
299                 goto bad;
300         so = nmp->nm_so;
            /* Cache the protocol flags; nfs_send() reads them from nm_soflags. */
301         nmp->nm_soflags = so->so_proto->pr_flags;
302 
303         /*
304          * Some servers require that the client port be a reserved port number.
305          */
306         if (nmp->nm_flag & NFSMNT_RESVPORT) {
307                 struct sockopt sopt;
308                 int ip, ip2, len;
                    /* Used as storage for the bind address of either family. */
309                 struct sockaddr_in6 ssin;
310                 struct sockaddr *sa;
311 
312                 bzero(&sopt, sizeof sopt);
313                 switch(saddr->sa_family) {
314                 case AF_INET:
315                         sopt.sopt_level = IPPROTO_IP;
316                         sopt.sopt_name = IP_PORTRANGE;
317                         ip = IP_PORTRANGE_LOW;
318                         ip2 = IP_PORTRANGE_DEFAULT;
319                         len = sizeof (struct sockaddr_in);
320                         break;
321 #ifdef INET6
322                 case AF_INET6:
323                         sopt.sopt_level = IPPROTO_IPV6;
324                         sopt.sopt_name = IPV6_PORTRANGE;
325                         ip = IPV6_PORTRANGE_LOW;
326                         ip2 = IPV6_PORTRANGE_DEFAULT;
327                         len = sizeof (struct sockaddr_in6);
328                         break;
329 #endif
330                 default:
                            /* Other address families skip the reserved-port step. */
331                         goto noresvport;
332                 }
                    /* Build an all-zero address of the right family and length to bind to. */
333                 sa = (struct sockaddr *)&ssin;
334                 bzero(sa, len);
335                 sa->sa_len = len;
336                 sa->sa_family = saddr->sa_family;
                    /* Select the low port range, bind, then put the range back to default. */
337                 sopt.sopt_dir = SOPT_SET;
338                 sopt.sopt_val = (void *)&ip;
339                 sopt.sopt_valsize = sizeof(ip);
340                 error = sosetopt(so, &sopt);
341                 if (error)
342                         goto bad;
343                 error = sobind(so, sa, td);
344                 if (error)
345                         goto bad;
                    /* sopt_val still points at ip, so this changes the value the next sosetopt() sets. */
346                 ip = ip2;
347                 error = sosetopt(so, &sopt);
348                 if (error)
349                         goto bad;
350         noresvport: ;
351         }
352 
353         /*
354          * Protocols that do not require connections may be optionally left
355          * unconnected for servers that reply from a port other than NFS_PORT.
356          */
357         mtx_lock(&nmp->nm_mtx);
358         if (nmp->nm_flag & NFSMNT_NOCONN) {
                    /* NOCONN makes no sense for a protocol that needs a connection. */
359                 if (nmp->nm_soflags & PR_CONNREQUIRED) {
360                         error = ENOTCONN;
361                         mtx_unlock(&nmp->nm_mtx);
362                         goto bad;
363                 } else
364                         mtx_unlock(&nmp->nm_mtx);
365         } else {
366                 mtx_unlock(&nmp->nm_mtx);
367                 error = soconnect(so, nmp->nm_nam, td);
368                 if (error)
369                         goto bad;
370 
371                 /*
372                  * Wait for the connection to complete. Cribbed from the
373                  * connect system call but with the wait timing out so
374                  * that interruptible mounts don't hang here for a long time.
375                  */
376                 SOCK_LOCK(so);
377                 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
378                         (void) msleep(&so->so_timeo, SOCK_MTX(so),
379                             PSOCK, "nfscon", 2 * hz);
                            /* Still connecting after the sleep: give up if the request was interrupted. */
380                         if ((so->so_state & SS_ISCONNECTING) &&
381                             so->so_error == 0 && rep &&
382                             (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0) {
383                                 so->so_state &= ~SS_ISCONNECTING;
384                                 SOCK_UNLOCK(so);
385                                 goto bad;
386                         }
387                 }
                    /* Pick up and clear any error the connect attempt left on the socket. */
388                 if (so->so_error) {
389                         error = so->so_error;
390                         so->so_error = 0;
391                         SOCK_UNLOCK(so);
392                         goto bad;
393                 }
394                 SOCK_UNLOCK(so);
395         }
396         so->so_rcv.sb_timeo = 12 * hz;
397         if (nmp->nm_sotype == SOCK_STREAM)
398                 so->so_snd.sb_timeo = 1 * hz;   /* 1s snd timeout for NFS/TCP */
399         else
400                 so->so_snd.sb_timeo = 5 * hz;
401 
402         /*
403          * Get buffer reservation size from sysctl, but impose reasonable
404          * limits.
405          */
406         pktscale = nfs_bufpackets;
407         if (pktscale < 2)
408                 pktscale = 2;
409         if (pktscale > 64)
410                 pktscale = 64;
411         mtx_lock(&nmp->nm_mtx);
            /* The SOCK_DGRAM and SOCK_SEQPACKET branches compute the same reservations. */
412         if (nmp->nm_sotype == SOCK_DGRAM) {
413                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
414                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
415                     NFS_MAXPKTHDR) * pktscale;
416         } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
417                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
418                 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
419                     NFS_MAXPKTHDR) * pktscale;
420         } else {
421                 if (nmp->nm_sotype != SOCK_STREAM)
422                         panic("nfscon sotype");
                    /*
                     * SO_KEEPALIVE and TCP_NODELAY are set on a best-effort
                     * basis: the sosetopt() results are not checked.  nm_mtx
                     * is dropped around each call (presumably because
                     * sosetopt() may sleep -- TODO confirm).
                     */
423                 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
424                         struct sockopt sopt;
425                         int val;
426 
427                         bzero(&sopt, sizeof sopt);
428                         sopt.sopt_dir = SOPT_SET;
429                         sopt.sopt_level = SOL_SOCKET;
430                         sopt.sopt_name = SO_KEEPALIVE;
431                         sopt.sopt_val = &val;
432                         sopt.sopt_valsize = sizeof val;
433                         val = 1;
434                         mtx_unlock(&nmp->nm_mtx);
435                         sosetopt(so, &sopt);
436                         mtx_lock(&nmp->nm_mtx);
437                 }
438                 if (so->so_proto->pr_protocol == IPPROTO_TCP) {
439                         struct sockopt sopt;
440                         int val;
441 
442                         bzero(&sopt, sizeof sopt);
443                         sopt.sopt_dir = SOPT_SET;
444                         sopt.sopt_level = IPPROTO_TCP;
445                         sopt.sopt_name = TCP_NODELAY;
446                         sopt.sopt_val = &val;
447                         sopt.sopt_valsize = sizeof val;
448                         val = 1;
449                         mtx_unlock(&nmp->nm_mtx);
450                         sosetopt(so, &sopt);
451                         mtx_lock(&nmp->nm_mtx);
452                 }
                    /* Stream reservations add room for a 32-bit word per packet. */
453                 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
454                     sizeof (u_int32_t)) * pktscale;
455                 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
456                     sizeof (u_int32_t)) * pktscale;
457         }
458         mtx_unlock(&nmp->nm_mtx);
459         error = soreserve(so, sndreserve, rcvreserve);
460         if (error)
461                 goto bad;
            /* Install the receive upcall (TCP or UDP flavour) with nmp as its argument. */
462         SOCKBUF_LOCK(&so->so_rcv);
463         so->so_rcv.sb_flags |= SB_NOINTR;
464         so->so_upcallarg = (caddr_t)nmp;
465         if (so->so_type == SOCK_STREAM)
466                 so->so_upcall = nfs_clnt_tcp_soupcall;
467         else    
468                 so->so_upcall = nfs_clnt_udp_soupcall;
469         so->so_rcv.sb_flags |= SB_UPCALL;
470         SOCKBUF_UNLOCK(&so->so_rcv);
471         SOCKBUF_LOCK(&so->so_snd);
472         so->so_snd.sb_flags |= SB_NOINTR;
473         SOCKBUF_UNLOCK(&so->so_snd);
474 
475         /* Restore current thread's credentials. */
476         td->td_ucred = origcred;
477 
478         mtx_lock(&nmp->nm_mtx);
479         /* Initialize other non-zero congestion variables */
480         nfs_init_rtt(nmp);
481         nmp->nm_cwnd = NFS_MAXCWND / 2;     /* Initial send window */
482         nmp->nm_sent = 0;
483         nmp->nm_timeouts = 0;
484         mtx_unlock(&nmp->nm_mtx);
485         return (0);
486 
487 bad:
488         /* Restore current thread's credentials. */
489         td->td_ucred = origcred;
490 
            /* Tear down whatever socket was created; nfs_disconnect() tolerates nm_so == NULL. */
491         nfs_disconnect(nmp);
492         return (error);
493 }
494 
495 static void
496 nfs_wakup_reconnectors(struct nfsmount *nmp)
497 {
498         KASSERT(mtx_owned(&nmp->nm_mtx), ("NFS mnt lock not owned !"));
499         if (--nmp->nm_nfstcpstate.sock_send_inprog == 0 &&
500             (nmp->nm_nfstcpstate.flags & NFS_TCP_WAIT_WRITE_DRAIN)) {
501                 nmp->nm_nfstcpstate.flags &= ~NFS_TCP_WAIT_WRITE_DRAIN;
502                 wakeup((caddr_t)&nmp->nm_nfstcpstate.sock_send_inprog);
503         }
504 }
505 
506 /*
507  * Reconnect routine:
508  * Called when a connection is broken on a reliable protocol.
509  * - clean up the old socket
510  * - nfs_connect() again
511  * - set R_MUSTRESEND for all outstanding requests on mount point
512  * If this fails the mount point is DEAD!
513  * nb: Must be called with the mount point's nm_mtx held (asserted below);
     * the mutex is released on every return path.  Connects are serialized
     * with nfs_connect_lock()/nfs_connect_unlock().
     *
     * Returns 0 when the connection was re-established or when another
     * thread had already cleared NFS_TCP_FORCE_RECONNECT; otherwise the
     * error from nfs_connect_lock(), or EIO/EINTR from nfs_connect()
     * (ERESTART is reported as EINTR).  Any other nfs_connect() failure is
     * retried after sleeping for hz ticks.
514  */
515 static int
516 nfs_reconnect(struct nfsreq *rep)
517 {
518         struct nfsreq *rp;
519         struct nfsmount *nmp = rep->r_nmp;
520         int error;
521         int slpflag = 0;
522 
523         KASSERT(mtx_owned(&nmp->nm_mtx), ("NFS mnt lock not owned !"));
524         if (nmp->nm_flag & NFSMNT_INT)
525                 slpflag = PCATCH;
526         /*
527          * Wait for any pending writes to this socket to drain (or timeout).
528          */
            /*
             * NOTE(review): the msleep() result is assigned but never examined,
             * so with PCATCH set an interrupted sleep just re-enters the loop
             * -- confirm that is intended.
             */
529         while (nmp->nm_nfstcpstate.sock_send_inprog > 0) {
530                 nmp->nm_nfstcpstate.flags |= NFS_TCP_WAIT_WRITE_DRAIN;
531                 error = msleep((caddr_t)&nmp->nm_nfstcpstate.sock_send_inprog,
532                                &nmp->nm_mtx, slpflag | (PZERO - 1), "nfscon", 0);               
533         }
534         /*
535          * Grab the nfs_connect_lock to serialize connects. 
536          * After grabbing the nfs_connect_lock, check if a reconnect is necessary or
537          * if someone else beat us to the connect !
538          */
            /*
             * NOTE(review): on failure this jumps to unlock_exit, which calls
             * nfs_connect_unlock() even though the lock was not obtained --
             * verify against nfs_connect_lock()'s contract.
             */
539         error = nfs_connect_lock(rep);
540         if (error)
541                 goto unlock_exit;
542         if ((nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) == 0)
543                 goto unlock_exit;
544         else
545                 mtx_unlock(&nmp->nm_mtx);
546 
            /* nm_mtx is not held from here until it is retaken below. */
547         nfs_reconnects++;
548         nfs_disconnect(nmp);
549         while ((error = nfs_connect(nmp, rep)) != 0) {
550                 if (error == ERESTART)
551                         error = EINTR;
552                 if (error == EIO || error == EINTR) {
                            /* unlock_exit expects nm_mtx to be held. */
553                         mtx_lock(&nmp->nm_mtx);
554                         goto unlock_exit;
555                 }
                    /* Any other failure: pause for hz ticks, then try again. */
556                 (void) tsleep(&fake_wchan, PSOCK, "nfscon", hz);
557         }
558 
559         /*
560          * Clear the FORCE_RECONNECT flag only after the connect 
561          * succeeds. To prevent races between multiple processes 
562          * waiting on the mountpoint where the connection is being
563          * torn down. The first one to acquire the sndlock will 
564          * retry the connection. The others block on the sndlock
565          * until the connection is established successfully, and 
566          * then re-transmit the request.
567          */
568         mtx_lock(&nmp->nm_mtx);
569         nmp->nm_nfstcpstate.flags &= ~NFS_TCP_FORCE_RECONNECT;
570         nmp->nm_nfstcpstate.rpcresid = 0;
571         mtx_unlock(&nmp->nm_mtx);       
572 
573         /*
574          * Loop through outstanding request list and fix up all requests
575          * on old socket.
576          */
577         mtx_lock(&nfs_reqq_mtx);
578         TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
579                 if (rp->r_nmp == nmp) {
580                         mtx_lock(&rp->r_mtx);                   
581                         rp->r_flags |= R_MUSTRESEND;
582                         mtx_unlock(&rp->r_mtx);
583                 }
584         }
585         mtx_unlock(&nfs_reqq_mtx);
586         mtx_lock(&nmp->nm_mtx);
            /* Common exit: entered with nm_mtx held; drops the connect lock, then nm_mtx. */
587 unlock_exit:
588         nfs_connect_unlock(rep);
589         mtx_unlock(&nmp->nm_mtx);               
590         return (error);
591 }
592 
593 /*
594  * NFS disconnect. Clean up and unlink.
595  */
596 void
597 nfs_disconnect(struct nfsmount *nmp)
598 {
599         struct socket *so;
600 
601         mtx_lock(&nmp->nm_mtx);
602         if (nmp->nm_so) {
603                 so = nmp->nm_so;
604                 nmp->nm_so = NULL;
605                 mtx_unlock(&nmp->nm_mtx);
606                 SOCKBUF_LOCK(&so->so_rcv);
607                 so->so_upcallarg = NULL;
608                 so->so_upcall = NULL;
609                 so->so_rcv.sb_flags &= ~SB_UPCALL;
610                 SOCKBUF_UNLOCK(&so->so_rcv);
611                 soshutdown(so, SHUT_WR);
612                 soclose(so);
613         } else
614                 mtx_unlock(&nmp->nm_mtx);
615 }
616 
/*
 * Disconnect the mount point's socket.
 *
 * This is a plain wrapper around nfs_disconnect().  It used to build a
 * zeroed dummy request on the stack that nothing ever read; that dead
 * local has been removed, so the visible behaviour is unchanged.
 */
void
nfs_safedisconnect(struct nfsmount *nmp)
{
        nfs_disconnect(nmp);
}
626 
627 /*
628  * This is the nfs send routine. For connection based socket types, it
629  * must be called with an nfs_sndlock() on the socket.
630  * - return EINTR if the RPC is terminated, 0 otherwise
631  * - set R_MUSTRESEND if the send fails for any reason
632  * - do any cleanup required by recoverable socket errors (?)
     *
     * Arguments:
     *   so   - value passed in is not used; it is overwritten with the
     *          mount point's current socket (rep->r_nmp->nm_so).
     *   nam  - destination address, used only when the protocol does not
     *          require a connection and the socket is not connected.
     *   top  - mbuf chain to send.  It is freed here on the early-exit
     *          paths; otherwise it is handed to sosend().
     *   rep  - request being sent; must not be NULL.
     *
     * Return values as implemented below: whatever nfs_sigintr() reports
     * before the send; EPIPE when the mount point has no socket; after a
     * failed send only EINTR, ERESTART, EIO and EPIPE are passed through,
     * every other error is turned into 0 so the request is retransmitted
     * later.
633  */
634 int
635 nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
636     struct nfsreq *rep)
637 {
638         struct sockaddr *sendnam;
639         int error, error2, soflags, flags;
640 
641         KASSERT(rep, ("nfs_send: called with rep == NULL"));
642 
            /* Do not send at all if the request has already been interrupted. */
643         error = nfs_sigintr(rep->r_nmp, rep, rep->r_td);
644         if (error) {
645                 m_freem(top);
646                 return (error);
647         }
648         mtx_lock(&rep->r_nmp->nm_mtx);
649         mtx_lock(&rep->r_mtx);
            /* No socket on the mount point: mark the request for resend and report EPIPE. */
650         if ((so = rep->r_nmp->nm_so) == NULL) {
651                 rep->r_flags |= R_MUSTRESEND;
652                 mtx_unlock(&rep->r_mtx);
653                 mtx_unlock(&rep->r_nmp->nm_mtx);
654                 m_freem(top);
655                 return (EPIPE);
656         }
657         rep->r_flags &= ~R_MUSTRESEND;
658         soflags = rep->r_nmp->nm_soflags;
659         mtx_unlock(&rep->r_mtx);
660         mtx_unlock(&rep->r_nmp->nm_mtx);
661 
            /* Connected or connection-oriented sockets are sent to without an address. */
662         if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
663                 sendnam = NULL;
664         else
665                 sendnam = nam;
666         if (so->so_type == SOCK_SEQPACKET)
667                 flags = MSG_EOR;
668         else
669                 flags = 0;
670 
671         error = sosend(so, sendnam, 0, top, 0, flags, curthread /*XXX*/);
            /* ENOBUFS on a datagram socket is not an error: flag the request for resend. */
672         if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
673                 error = 0;
674                 mtx_lock(&rep->r_mtx);
675                 rep->r_flags |= R_MUSTRESEND;
676                 mtx_unlock(&rep->r_mtx);
677         }
678 
679         if (error) {
680                 /*
681                  * Don't report EPIPE errors on nfs sockets.
682                  * These can be due to idle tcp mounts which will be closed by
683                  * netapp, solaris, etc. if left idle too long.
684                  */
685                 if (error != EPIPE) {
686                         log(LOG_INFO, "nfs send error %d for server %s\n",
687                             error,
688                             rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
689                 }
690                 /*
691                  * Deal with errors for the client side.
692                  */
                    /* A non-zero NFS_SIGREP() result replaces the send error; otherwise mark for resend. */
693                 error2 = NFS_SIGREP(rep);
694                 if (error2)
695                         error = error2;
696                 else {
697                         mtx_lock(&rep->r_mtx);
698                         rep->r_flags |= R_MUSTRESEND;
699                         mtx_unlock(&rep->r_mtx);
700                 }
701 
702                 /*
703                  * Handle any recoverable (soft) socket errors here. (?)
704                  * Make EWOULDBLOCK a recoverable error, we'll rexmit from nfs_timer().
705                  */
706                 if (error != EINTR && error != ERESTART && error != EIO && error != EPIPE)
707                         error = 0;
708         }
709         return (error);
710 }
711 
    /*
     * Wait for the reply to request rep.
     *
     * For non-datagram sockets the routine first makes sure the request
     * is actually on the wire: it reconnects when the mount point has no
     * socket or NFS_TCP_FORCE_RECONNECT is set, and resends the request
     * while R_MUSTRESEND is set.  It then sleeps on rep until a reply is
     * attached (r_mrep), the request is soft-terminated, the sleep is
     * interrupted or, for non-datagram sockets, R_MUSTRESEND is raised
     * again.
     *
     * Returns 0 once r_mrep is set, EINTR when the sleep is interrupted
     * or R_SOFTTERM is seen before sleeping, ETIMEDOUT when R_SOFTTERM
     * is seen after sleeping, or an error from nfs_reconnect()/nfs_send().
     *
     * NOTE: the prototype near the top of the file declares this function
     * static; the definition below omits the keyword.
     */
712 int
713 nfs_reply(struct nfsreq *rep)
714 {
715         register struct socket *so;
716         register struct mbuf *m;
717         int error = 0, sotype, slpflag;
718         struct nfsmount *nmp = rep->r_nmp;
719         
720         sotype = nmp->nm_sotype;
721         /*
722          * For reliable protocols, lock against other senders/receivers
723          * in case a reconnect is necessary.
724          */
725         if (sotype != SOCK_DGRAM) {
726 tryagain:
727                 mtx_lock(&nmp->nm_mtx);
728                 mtx_lock(&rep->r_mtx);
                    /* Reply already attached: nothing to wait for. */
729                 if (rep->r_mrep) {
730                         mtx_unlock(&rep->r_mtx);
731                         mtx_unlock(&nmp->nm_mtx);
732                         return (0);
733                 }
734                 if (rep->r_flags & R_SOFTTERM) {
735                         mtx_unlock(&rep->r_mtx);
736                         mtx_unlock(&nmp->nm_mtx);
737                         return (EINTR);
738                 }
739                 so = nmp->nm_so;
740                 if (!so || 
741                     (nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) {
742                         mtx_unlock(&rep->r_mtx);
743                         nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
                            /* nfs_reconnect() is entered with nm_mtx held and releases it. */
744                         error = nfs_reconnect(rep);
745                         if (error)
746                                 return (error);
747                         goto tryagain;
748                 }
749                 while (rep->r_flags & R_MUSTRESEND) {
750                         mtx_unlock(&rep->r_mtx);
                            /* Count this send so nfs_reconnect() can wait for it to drain. */
751                         nmp->nm_nfstcpstate.sock_send_inprog++;
752                         mtx_unlock(&nmp->nm_mtx);
                            /* Send a copy; the original request chain stays in r_mreq. */
753                         m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
754                         nfsstats.rpcretries++;
755                         error = nfs_send(so, nmp->nm_nam, m, rep);
756                         if (error) {
757                                 mtx_lock(&nmp->nm_mtx);
758                                 nfs_wakup_reconnectors(nmp);
                                    /* Anything but an interrupt forces a reconnect (which drops nm_mtx). */
759                                 if (!(error == EINTR || error == ERESTART)) {
760                                         nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
761                                         error = nfs_reconnect(rep);
762                                 } else
763                                         mtx_unlock(&nmp->nm_mtx);
764                                 if (error)
765                                         return (error);
766                                 goto tryagain;
767                         } else {
                                    /* Retake both locks, in loop-entry order, before re-testing R_MUSTRESEND. */
768                                 mtx_lock(&nmp->nm_mtx);
769                                 nfs_wakup_reconnectors(nmp);
770                                 mtx_lock(&rep->r_mtx);
771                         }
772                 }
773                 mtx_unlock(&rep->r_mtx);
774                 mtx_unlock(&nmp->nm_mtx);
775         }
776         slpflag = 0;
777         mtx_lock(&nmp->nm_mtx);
778         if (nmp->nm_flag & NFSMNT_INT)
779                 slpflag = PCATCH;
780         mtx_unlock(&nmp->nm_mtx);
781         mtx_lock(&rep->r_mtx);
            /*
             * Sleep on the request until a reply arrives, the request is
             * soft-terminated, msleep() reports an error or, for non-datagram
             * sockets, the request is flagged for resend.
             */
782         while ((rep->r_mrep == NULL) && (error == 0) && 
783                ((rep->r_flags & R_SOFTTERM) == 0) &&
784                ((sotype == SOCK_DGRAM) || ((rep->r_flags & R_MUSTRESEND) == 0)))
785                 error = msleep((caddr_t)rep, &rep->r_mtx, 
786                                slpflag | (PZERO - 1), "nfsreq", 0);
787         if (error == EINTR || error == ERESTART) {
788                 /* NFS operations aren't restartable. Map ERESTART to EINTR */
789                 mtx_unlock(&rep->r_mtx);
790                 return (EINTR);
791         }
792         if (rep->r_flags & R_SOFTTERM) {
793                 /* Request was terminated because we exceeded the retries (soft mount) */
794                 mtx_unlock(&rep->r_mtx);
795                 return (ETIMEDOUT);
796         }
797         mtx_unlock(&rep->r_mtx);
            /* On a stream socket, a pending reconnect or resend sends us back to the top. */
798         if (sotype == SOCK_STREAM) {
799                 mtx_lock(&nmp->nm_mtx);
800                 mtx_lock(&rep->r_mtx);
801                 if (((nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) || 
802                      (rep->r_flags & R_MUSTRESEND))) {
803                         mtx_unlock(&rep->r_mtx);
804                         mtx_unlock(&nmp->nm_mtx);       
805                         goto tryagain;
806                 } else {
807                         mtx_unlock(&rep->r_mtx);
808                         mtx_unlock(&nmp->nm_mtx);       
809                 }
810         }
811         return (error);
812 }
813 
814 /*
     * Match a received RPC reply to the outstanding request waiting for it.
     *
     * The reply chain mrep is realigned, its first two 32-bit words are read
     * (the xid, then a word that must equal rpc_reply), and nfs_reqq is
     * searched for a request with the same xid that has no reply attached
     * yet.  On a match the reply is attached to that request, nmp's
     * congestion-window and timeout fields are updated, and wakeup() is
     * called on the request.  A reply that cannot be realigned, is not an
     * rpc_reply, or matches nothing is freed and counted in nfsstats.
     *
     * "so" is not referenced by name anywhere in this function.
     *
815  * XXX TO DO
816  * Make nfs_realign() non-blocking. Also make nfsm_dissect() nonblocking.
817  */
818 static void
819 nfs_clnt_match_xid(struct socket *so, 
820                    struct nfsmount *nmp, 
821                    struct mbuf *mrep)
822 {
823         struct mbuf *md;
824         caddr_t dpos;
825         u_int32_t rxid, *tl;
826         struct nfsreq *rep;
            /*
             * NOTE(review): error is never assigned or read by name in this
             * function; presumably nfsm_dissect_nonblock() stores into it --
             * confirm against the macro definition.
             */
827         int error;
828         
829         /*
830          * Search for any mbufs that are not a multiple of 4 bytes long
831          * or with m_data not longword aligned.
832          * These could cause pointer alignment problems, so copy them to
833          * well aligned mbufs.
             *
             * Only an ENOMEM return is treated as a failure here; in that
             * case the reply is freed and counted as invalid.
834          */
835         if (nfs_realign(&mrep, 5 * NFSX_UNSIGNED) == ENOMEM) {
836                 m_freem(mrep);
837                 nfsstats.rpcinvalid++;
838                 return;
839         }
840         
841         /*
842          * Get the xid and check that it is an rpc reply
             *
             * md and dpos are pointed at the start of the reply first.  They
             * are not passed to nfsm_dissect_nonblock() explicitly; presumably
             * the macro uses them as its parse cursor -- confirm against the
             * macro definition.  tl then addresses two consecutive u_int32_t
             * values: the xid and the word compared against rpc_reply.
843          */
844         md = mrep;
845         dpos = mtod(md, caddr_t);
846         tl = nfsm_dissect_nonblock(u_int32_t *, 2*NFSX_UNSIGNED);
847         rxid = *tl++;
848         if (*tl != rpc_reply) {
849                 m_freem(mrep);
                    /*
                     * NOTE(review): no goto to nfsmout is visible in this
                     * function; presumably nfsm_dissect_nonblock() jumps here
                     * on failure.  A jump to the label skips the m_freem()
                     * above and only bumps the counter and returns -- confirm
                     * that the macro disposes of mrep itself.
                     */
850 nfsmout:
851                 nfsstats.rpcinvalid++;
852                 return;
853         }
854 
            /* nfs_reqq_mtx is held for the whole walk of the request list. */
855         mtx_lock(&nfs_reqq_mtx);
856         /*
857          * Loop through the request list to match up the reply
858          * Iff no match, just drop the datagram
             *
             * For every entry examined, nmp->nm_mtx and then rep->r_mtx are
             * taken, and released in the reverse order, whether or not the
             * entry matches.
             *
             * NOTE(review): the match is on xid alone; rep->r_nmp is not
             * compared against nmp, yet it is nmp's fields that are updated
             * below.
859          */
860         TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
861                 mtx_lock(&nmp->nm_mtx);
862                 mtx_lock(&rep->r_mtx);
863                 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
864                         /* Found it.. */
                            /*
                             * Hand the reply chain and the current md/dpos
                             * values over to the request.
                             */
865                         rep->r_mrep = mrep;
866                         rep->r_md = md;
867                         rep->r_dpos = dpos;
868                         /*
869                          * Update congestion window.
870                          * Do the additive increase of
871                          * one rpc/rtt.
                             *
                             * The window grows only while nm_cwnd <= nm_sent
                             * and is clamped to NFS_MAXCWND.
872                          */
873                         if (nmp->nm_cwnd <= nmp->nm_sent) {
874                                 nmp->nm_cwnd +=
875                                         (NFS_CWNDSCALE * NFS_CWNDSCALE +
876                                          (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
877                                 if (nmp->nm_cwnd > NFS_MAXCWND)
878                                         nmp->nm_cwnd = NFS_MAXCWND;
879                         }       
                            /*
                             * Clear R_SENT and take this request's
                             * NFS_CWNDSCALE back out of nm_sent.
                             */
880                         if (rep->r_flags & R_SENT) {
881                                 rep->r_flags &= ~R_SENT;
882                                 nmp->nm_sent -= NFS_CWNDSCALE;
883                         }
884                         if (rep->r_flags & R_TIMING)
885                                 nfs_update_rtt(rep);
                            /* A reply arrived, so reset the mount's timeout count. */
886                         nmp->nm_timeouts = 0;
                            /*
                             * Wake any thread sleeping on rep; the reply path
                             * earlier in this file msleep()s on rep with
                             * r_mtx while r_mrep is still NULL.
                             */
887                         wakeup((caddr_t)rep);
888                         mtx_unlock(&rep->r_mtx);
889                         mtx_unlock(&nmp->nm_mtx);
890                         break;
891                 }
892                 mtx_unlock(&rep->r_mtx);
893                 mtx_unlock(&nmp->nm_mtx);
894         }
895         /*
896          * If not matched to a request, drop it.
897          * If it's mine, wake up requestor.
             *
             * rep is null here when the loop above finished without taking
             * the break, i.e. no request claimed the reply.  The check is
             * made while nfs_reqq_mtx is still held.
898          */
899         if (rep == 0) {
900                 nfsstats.rpcunexpected++;
901                 m_freem(mrep);
902         }
903         mtx_unlock(&nfs_reqq_mtx);
904 }
905 
/*
 * Flag mount nmp as needing a reconnect and wake its waiting requests.
 *
 * NFS_TCP_FORCE_RECONNECT is set in nmp->nm_nfstcpstate.flags under
 * nmp->nm_mtx.  That mutex is dropped before nfs_reqq_mtx is taken, so the
 * two are never held together here.  The function then walks nfs_reqq and,
 * for every request whose r_nmp is nmp, sets R_MUSTRESEND under the
 * request's r_mtx and calls wakeup() on the request.
 */
906 static void
907 nfs_mark_for_reconnect(struct nfsmount *nmp)
908 {
909         struct nfsreq *rp;
910 
911         mtx_lock(&nmp->nm_mtx);
912         nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
913         mtx_unlock(&nmp->nm_mtx);
914         /* 
915          * Wakeup all processes that are waiting for replies 
916          * on this mount point. One of them does the reconnect.
             *
             * Requests queued for other mounts are skipped untouched.
917          */
918         mtx_lock(&nfs_reqq_mtx);
919         TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
920                 if (rp->r_nmp == nmp) {
921                         mtx_lock(&rp->r_mtx);
                            /*
                             * For a stream socket the reply path earlier in
                             * this file stops sleeping once R_MUSTRESEND is
                             * set, so the flag is set before the wakeup.
                             */
922                         rp->r_flags |= R_MUSTRESEND;
923                         wakeup((caddr_t)rp);
924                         mtx_unlock(&rp->r_mtx);
925                 }
926         }
927         mtx_unlock(&nfs_reqq_mtx);
928 }
929 
930 static int
931 nfstcp_readable(struct socket *so, int bytes)
932 {
933         int retval;
934         
935         SOCKBUF_LOCK(&so->so_rcv);
936         retval = (so->so_rcv.sb_cc >= (bytes) ||
937                   (so->so_rcv.sb_state & SBS_CANTRCVMORE) ||
938                   so->so_error);
939         SOCKBUF_UNLOCK(&so->so_rcv);
940         return (retval);